/* * Copyright (c) 2016 Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* 3. The names of its contributors may not be used to endorse or promote
*    products derived from this software without specific prior written
*    permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* *********************************************************************************************** *
* CARLsim
* created by: (MDR) Micah Richert, (JN) Jayram M. Nageswaran
* maintained by:
* (MA) Mike Avery <averym@uci.edu>
* (MB) Michael Beyeler <mbeyeler@uci.edu>,
* (KDC) Kristofor Carlson <kdcarlso@uci.edu>
* (TSC) Ting-Shuo Chou <tingshuc@uci.edu>
* (HK) Hirak J Kashyap <kashyaph@uci.edu>
*
* CARLsim v1.0: JM, MDR
* CARLsim v2.0/v2.1/v2.2: JM, MDR, MA, MB, KDC
* CARLsim3: MB, KDC, TSC
* CARLsim4: TSC, HK
* CARLsim5: HK, JX, KC
*
* CARLsim available from http://socsci.uci.edu/~jkrichma/CARLsim/
* Ver 12/31/2016
*/

#include <snn.h>

#include <spike_buffer.h>

// spikeGeneratorUpdate_CPU on CPUs
// Advances all spike-generator inputs of network netId by one ms:
// draws fresh random numbers for rate-based Poisson generators and
// transfers callback-generated spikes (SpikeBuffer) into spikeGenBits.
#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::spikeGeneratorUpdate_CPU(int netId) {
#else // POSIX
	void* SNN::spikeGeneratorUpdate_CPU(int netId) {
#endif
	assert(runtimeData[netId].allocated);
	assert(runtimeData[netId].memType == CPU_MEM);

	// FIXME: skip this step if all spike gen neurons are Poisson neurons (generated by rate)
	// update the random number for every Poisson spike generator (spikes generated by rate)
	for (int poisN = 0; poisN < networkConfigs[netId].numNPois; poisN++) {
		// CPU-mode random generator; the draw is compared against the firing rate in getPoissonSpike()
		runtimeData[netId].randNum[poisN] = drand48();
	}

	// Use spike generators (user-defined callback function)
	if (networkConfigs[netId].numNSpikeGen > 0) {
		assert(managerRuntimeData.spikeGenBits != NULL);

		// reset the bit status of the spikeGenBits (one bit per spike-gen neuron, 32 packed per int)
		memset(managerRuntimeData.spikeGenBits, 0, sizeof(int) * (networkConfigs[netId].numNSpikeGen / 32 + 1));

		// fill spikeGenBits from SpikeBuffer
		fillSpikeGenBits(netId);

		// copy the spikeGenBits from the manager to the CPU runtime
		memcpy(runtimeData[netId].spikeGenBits, managerRuntimeData.spikeGenBits, sizeof(int) * (networkConfigs[netId].numNSpikeGen / 32 + 1));
	}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__)
	// POSIX builds declare this method void* (pthread-compatible signature);
	// flowing off the end of a non-void function is undefined behavior.
	return NULL;
#endif
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperSpikeGeneratorUpdate_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> spikeGeneratorUpdate_CPU(args->netId);
		pthread_exit(0);
	}
#endif

#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::updateTimingTable_CPU(int netId) {
#else // POSIX
	void* SNN::updateTimingTable_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);

	runtimeData[netId].timeTableD2[simTimeMs + networkConfigs[netId].maxDelay + 1] = runtimeData[netId].spikeCountD2Sec + runtimeData[netId].spikeCountLastSecLeftD2;
	runtimeData[netId].timeTableD1[simTimeMs + networkConfigs[netId].maxDelay + 1] = runtimeData[netId].spikeCountD1Sec;
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperUpdateTimingTable_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> updateTimingTable_CPU(args->netId);
		pthread_exit(0);
	}
#endif

//void SNN::routeSpikes_CPU() {
//	int firingTableIdxD2, firingTableIdxD1;
//	int GtoLOffset;
//	// ToDo: route spikes using routing table. currently only exchange spikes between GPU0 and GPU1
//	// GPU0 -> GPU1
//	if (!groupPartitionLists[0].empty() && !groupPartitionLists[1].empty()) {
//		memcpy(managerRuntimeData.extFiringTableEndIdxD2, runtimeData[0].extFiringTableEndIdxD2, sizeof(int) * networkConfigs[0].numGroups);
//		memcpy(managerRuntimeData.extFiringTableEndIdxD1, runtimeData[0].extFiringTableEndIdxD1, sizeof(int) * networkConfigs[0].numGroups);
//		memcpy(managerRuntimeData.extFiringTableD2, runtimeData[0].extFiringTableD2, sizeof(int*) * networkConfigs[0].numGroups);
//		memcpy(managerRuntimeData.extFiringTableD1, runtimeData[0].extFiringTableD1, sizeof(int*) * networkConfigs[0].numGroups);
//		//KERNEL_DEBUG("GPU0 D1ex:%d/D2ex:%d", managerRuntimeData.extFiringTableEndIdxD1[0], managerRuntimeData.extFiringTableEndIdxD2[0]);
//
//		memcpy(managerRuntimeData.timeTableD2, runtimeData[1].timeTableD2, sizeof(int) * (1000 + glbNetworkConfig.maxDelay + 1));
//		memcpy(managerRuntimeData.timeTableD1, runtimeData[1].timeTableD1, sizeof(int) * (1000 + glbNetworkConfig.maxDelay + 1));
//		firingTableIdxD2 = managerRuntimeData.timeTableD2[simTimeMs + glbNetworkConfig.maxDelay + 1];
//		firingTableIdxD1 = managerRuntimeData.timeTableD1[simTimeMs + glbNetworkConfig.maxDelay + 1];
//		//KERNEL_DEBUG("GPU1 D1:%d/D2:%d", firingTableIdxD1, firingTableIdxD2);
//
//		for (int lGrpId = 0; lGrpId < networkConfigs[0].numGroups; lGrpId++) {
//			if (groupConfigs[0][lGrpId].hasExternalConnect && managerRuntimeData.extFiringTableEndIdxD2[lGrpId] > 0) {
//				memcpy(runtimeData[1].firingTableD2 + firingTableIdxD2,
//					managerRuntimeData.extFiringTableD2[lGrpId],
//					sizeof(int) * managerRuntimeData.extFiringTableEndIdxD2[lGrpId]);
//
//				for (std::list<GroupConfigMD>::iterator grpIt = groupPartitionLists[1].begin(); grpIt != groupPartitionLists[1].end(); grpIt++) {
//					if (grpIt->gGrpId == groupConfigs[0][lGrpId].gGrpId)
//						GtoLOffset = grpIt->GtoLOffset;
//				}
//
//				convertExtSpikesD2_CPU(1, firingTableIdxD2,
//					firingTableIdxD2 + managerRuntimeData.extFiringTableEndIdxD2[lGrpId],
//					GtoLOffset); // [StartIdx, EndIdx)
//				firingTableIdxD2 += managerRuntimeData.extFiringTableEndIdxD2[lGrpId];
//			}
//
//			if (groupConfigs[0][lGrpId].hasExternalConnect && managerRuntimeData.extFiringTableEndIdxD1[lGrpId] > 0) {
//				memcpy(runtimeData[1].firingTableD1 + firingTableIdxD1,
//					managerRuntimeData.extFiringTableD1[lGrpId],
//					sizeof(int) * managerRuntimeData.extFiringTableEndIdxD1[lGrpId]);
//
//				for (std::list<GroupConfigMD>::iterator grpIt = groupPartitionLists[1].begin(); grpIt != groupPartitionLists[1].end(); grpIt++) {
//					if (grpIt->gGrpId == groupConfigs[0][lGrpId].gGrpId)
//						GtoLOffset = grpIt->GtoLOffset;
//				}
//
//				convertExtSpikesD1_CPU(1, firingTableIdxD1,
//					firingTableIdxD1 + managerRuntimeData.extFiringTableEndIdxD1[lGrpId],
//					GtoLOffset); // [StartIdx, EndIdx)
//				firingTableIdxD1 += managerRuntimeData.extFiringTableEndIdxD1[lGrpId];
//
//			}
//			//KERNEL_DEBUG("GPU1 New D1:%d/D2:%d", firingTableIdxD1, firingTableIdxD2);
//		}
//		managerRuntimeData.timeTableD2[simTimeMs + glbNetworkConfig.maxDelay + 1] = firingTableIdxD2;
//		managerRuntimeData.timeTableD1[simTimeMs + glbNetworkConfig.maxDelay + 1] = firingTableIdxD1;
//		memcpy(runtimeData[1].timeTableD2, managerRuntimeData.timeTableD2, sizeof(int)*(1000 + glbNetworkConfig.maxDelay + 1));
//		memcpy(runtimeData[1].timeTableD1, managerRuntimeData.timeTableD1, sizeof(int)*(1000 + glbNetworkConfig.maxDelay + 1));
//	}
//
//	// GPU1 -> GPU0
//	if (!groupPartitionLists[1].empty() && !groupPartitionLists[0].empty()) {
//		memcpy(managerRuntimeData.extFiringTableEndIdxD2, runtimeData[1].extFiringTableEndIdxD2, sizeof(int) * networkConfigs[1].numGroups);
//		memcpy(managerRuntimeData.extFiringTableEndIdxD1, runtimeData[1].extFiringTableEndIdxD1, sizeof(int) * networkConfigs[1].numGroups);
//		memcpy(managerRuntimeData.extFiringTableD2, runtimeData[1].extFiringTableD2, sizeof(int*) * networkConfigs[1].numGroups);
//		memcpy(managerRuntimeData.extFiringTableD1, runtimeData[1].extFiringTableD1, sizeof(int*) * networkConfigs[1].numGroups);
//		//KERNEL_DEBUG("GPU1 D1ex:%d/D2ex:%d", managerRuntimeData.extFiringTableEndIdxD1[0], managerRuntimeData.extFiringTableEndIdxD2[0]);
//
//		memcpy(managerRuntimeData.timeTableD2, runtimeData[0].timeTableD2, sizeof(int)*(1000 + glbNetworkConfig.maxDelay + 1));
//		memcpy(managerRuntimeData.timeTableD1, runtimeData[0].timeTableD1, sizeof(int)*(1000 + glbNetworkConfig.maxDelay + 1));
//		firingTableIdxD2 = managerRuntimeData.timeTableD2[simTimeMs + glbNetworkConfig.maxDelay + 1];
//		firingTableIdxD1 = managerRuntimeData.timeTableD1[simTimeMs + glbNetworkConfig.maxDelay + 1];
//		//KERNEL_DEBUG("GPU0 D1:%d/D2:%d", firingTableIdxD1, firingTableIdxD2);
//
//		for (int lGrpId = 0; lGrpId < networkConfigs[1].numGroups; lGrpId++) {
//			if (groupConfigs[1][lGrpId].hasExternalConnect && managerRuntimeData.extFiringTableEndIdxD2[lGrpId] > 0) {
//				memcpy(runtimeData[0].firingTableD2 + firingTableIdxD2,
//					managerRuntimeData.extFiringTableD2[lGrpId],
//					sizeof(int) * managerRuntimeData.extFiringTableEndIdxD2[lGrpId]);
//
//				for (std::list<GroupConfigMD>::iterator grpIt = groupPartitionLists[0].begin(); grpIt != groupPartitionLists[0].end(); grpIt++) {
//					if (grpIt->gGrpId == groupConfigs[1][lGrpId].gGrpId)
//						GtoLOffset = grpIt->GtoLOffset;
//				}
//
//				convertExtSpikesD2_CPU(0, firingTableIdxD2,
//					firingTableIdxD2 + managerRuntimeData.extFiringTableEndIdxD2[lGrpId],
//					GtoLOffset); // [StartIdx, EndIdx)
//				firingTableIdxD2 += managerRuntimeData.extFiringTableEndIdxD2[lGrpId];
//			}
//
//			if (groupConfigs[1][lGrpId].hasExternalConnect && managerRuntimeData.extFiringTableEndIdxD1[lGrpId] > 0) {
//				memcpy(runtimeData[0].firingTableD1 + firingTableIdxD1,
//					managerRuntimeData.extFiringTableD1[lGrpId],
//					sizeof(int) * managerRuntimeData.extFiringTableEndIdxD1[lGrpId]);
//
//				for (std::list<GroupConfigMD>::iterator grpIt = groupPartitionLists[0].begin(); grpIt != groupPartitionLists[0].end(); grpIt++) {
//					if (grpIt->gGrpId == groupConfigs[1][lGrpId].gGrpId)
//						GtoLOffset = grpIt->GtoLOffset;
//				}
//
//				convertExtSpikesD1_CPU(0, firingTableIdxD1,
//					firingTableIdxD1 + managerRuntimeData.extFiringTableEndIdxD1[lGrpId],
//					GtoLOffset); // [StartIdx, EndIdx)
//				firingTableIdxD1 += managerRuntimeData.extFiringTableEndIdxD1[lGrpId];
//			}
//			//KERNEL_DEBUG("GPU0 New D1:%d/D2:%d", firingTableIdxD1, firingTableIdxD2);
//		}
//		managerRuntimeData.timeTableD2[simTimeMs + glbNetworkConfig.maxDelay + 1] = firingTableIdxD2;
//		managerRuntimeData.timeTableD1[simTimeMs + glbNetworkConfig.maxDelay + 1] = firingTableIdxD1;
//		memcpy(runtimeData[0].timeTableD2, managerRuntimeData.timeTableD2, sizeof(int)*(1000 + glbNetworkConfig.maxDelay + 1));
//		memcpy(runtimeData[0].timeTableD1, managerRuntimeData.timeTableD1, sizeof(int)*(1000 + glbNetworkConfig.maxDelay + 1));
//	}
//
//}

#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::convertExtSpikesD2_CPU(int netId, int startIdx, int endIdx, int GtoLOffset) {
#else // POSIX
	void* SNN::convertExtSpikesD2_CPU(int netId, int startIdx, int endIdx, int GtoLOffset) {
#endif
	int spikeCountExtRx = endIdx - startIdx; // received external spike count

	runtimeData[netId].spikeCountD2Sec += spikeCountExtRx;
	runtimeData[netId].spikeCountExtRxD2 += spikeCountExtRx;
	runtimeData[netId].spikeCountExtRxD2Sec += spikeCountExtRx;

	// FIXME: if endIdx - startIdx > 64 * 128
	//if (firingTableIdx < endIdx)
	for (int extIdx = startIdx; extIdx < endIdx; extIdx++)
		runtimeData[netId].firingTableD2[extIdx] += GtoLOffset;
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperConvertExtSpikesD2_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> convertExtSpikesD2_CPU(args->netId, args->startIdx, args->endIdx, args->GtoLOffset);
		pthread_exit(0);
	}
#endif

#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::convertExtSpikesD1_CPU(int netId, int startIdx, int endIdx, int GtoLOffset) {
#else // POSIX
	void* SNN::convertExtSpikesD1_CPU(int netId, int startIdx, int endIdx, int GtoLOffset) {
#endif
	int spikeCountExtRx = endIdx - startIdx; // received external spike count

	runtimeData[netId].spikeCountD1Sec += spikeCountExtRx;
	runtimeData[netId].spikeCountExtRxD1 += spikeCountExtRx;
	runtimeData[netId].spikeCountExtRxD1Sec += spikeCountExtRx;

	// FIXME: if endIdx - startIdx > 64 * 128
	for (int extIdx = startIdx; extIdx < endIdx; extIdx++)
		runtimeData[netId].firingTableD1[extIdx] += GtoLOffset;
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperConvertExtSpikesD1_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> convertExtSpikesD1_CPU(args->netId, args->startIdx, args->endIdx, args->GtoLOffset);
		pthread_exit(0);
	}
#endif

// Resets the per-group end indices of the external firing tables of network
// netId, effectively emptying them for the next ms of spike routing.
#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::clearExtFiringTable_CPU(int netId) {
#else // POSIX
	void* SNN::clearExtFiringTable_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);

	memset(runtimeData[netId].extFiringTableEndIdxD1, 0, sizeof(int) * networkConfigs[netId].numGroups);
	memset(runtimeData[netId].extFiringTableEndIdxD2, 0, sizeof(int) * networkConfigs[netId].numGroups);

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__)
	return NULL; // POSIX signature returns void*; avoid flowing off a non-void function (UB)
#endif
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperClearExtFiringTable_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> clearExtFiringTable_CPU(args->netId);
		pthread_exit(0);
	}
#endif

// Copies the D1/D2 timing tables of CPU runtime netId to the manager runtime
// (toManager == true) or from the manager back to the runtime (toManager == false).
void SNN::copyTimeTable(int netId, bool toManager) {
	assert(netId >= CPU_RUNTIME_BASE);

	// each table holds one slot per ms of a second plus maxDelay + 1 extra slots
	const size_t tableBytes = sizeof(int) * (1000 + glbNetworkConfig.maxDelay + 1);

	if (toManager) {
		memcpy(managerRuntimeData.timeTableD2, runtimeData[netId].timeTableD2, tableBytes);
		memcpy(managerRuntimeData.timeTableD1, runtimeData[netId].timeTableD1, tableBytes);
	} else {
		memcpy(runtimeData[netId].timeTableD2, managerRuntimeData.timeTableD2, tableBytes);
		memcpy(runtimeData[netId].timeTableD1, managerRuntimeData.timeTableD1, tableBytes);
	}
}

// Copies the external firing tables (per-group end indices and the per-group
// row pointers, both delay classes) of CPU runtime netId into the manager runtime.
void SNN::copyExtFiringTable(int netId) {
	assert(netId >= CPU_RUNTIME_BASE);

	const int numGroups = networkConfigs[netId].numGroups;
	memcpy(managerRuntimeData.extFiringTableEndIdxD2, runtimeData[netId].extFiringTableEndIdxD2, sizeof(int) * numGroups);
	memcpy(managerRuntimeData.extFiringTableEndIdxD1, runtimeData[netId].extFiringTableEndIdxD1, sizeof(int) * numGroups);
	// note: only the row pointers are copied, not the spike data they point to
	memcpy(managerRuntimeData.extFiringTableD2, runtimeData[netId].extFiringTableD2, sizeof(int*) * numGroups);
	memcpy(managerRuntimeData.extFiringTableD1, runtimeData[netId].extFiringTableD1, sizeof(int*) * numGroups);
	//KERNEL_DEBUG("GPU0 D1ex:%d/D2ex:%d", managerRuntimeData.extFiringTableEndIdxD1[0], managerRuntimeData.extFiringTableEndIdxD2[0]);
}

// resets nSpikeCnt[]
// used for management of manager runtime data
// FIXME: make sure this is right when separating cpu_module to a standalone class
// FIXME: currently this function clears nSpikeCnt of manager runtime data
#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::resetSpikeCnt_CPU(int netId, int lGrpId) {
#else // POSIX
	void* SNN::resetSpikeCnt_CPU(int netId, int lGrpId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);

	if (lGrpId == ALL) {
		// clear the per-neuron spike counters of the entire network
		memset(runtimeData[netId].nSpikeCnt, 0, sizeof(int) * networkConfigs[netId].numN);
	} else {
		// clear only the counters of neurons belonging to local group lGrpId
		int lStartN = groupConfigs[netId][lGrpId].lStartN;
		int numN = groupConfigs[netId][lGrpId].numN;
		memset(runtimeData[netId].nSpikeCnt + lStartN, 0, sizeof(int) * numN);
	}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__)
	return NULL; // POSIX signature returns void*; avoid flowing off a non-void function (UB)
#endif
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperResetSpikeCnt_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> resetSpikeCnt_CPU(args->netId, args->lGrpId);
		pthread_exit(0);
	}
#endif

// This method loops through all spikes that are generated by neurons with a delay of 1ms
// and delivers the spikes to the appropriate post-synaptic neuron
#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::doCurrentUpdateD1_CPU(int netId) {
#else // POSIX
	void* SNN::doCurrentUpdateD1_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);

	int k     = runtimeData[netId].timeTableD1[simTimeMs + networkConfigs[netId].maxDelay + 1] - 1;
	int k_end = runtimeData[netId].timeTableD1[simTimeMs + networkConfigs[netId].maxDelay];

	while((k >= k_end) && (k >= 0)) {
		int lNId = runtimeData[netId].firingTableD1[k];
		//assert(lNId < networkConfigs[netId].numN);

		DelayInfo dPar = runtimeData[netId].postDelayInfo[lNId * (networkConfigs[netId].maxDelay + 1)];

		unsigned int offset = runtimeData[netId].cumulativePost[lNId];

		for(int idx_d = dPar.delay_index_start; idx_d < (dPar.delay_index_start + dPar.delay_length); idx_d = idx_d + 1) {
			// get synaptic info...
			SynInfo postInfo = runtimeData[netId].postSynapticIds[offset + idx_d];

			int postNId = GET_CONN_NEURON_ID(postInfo);
			assert(postNId < networkConfigs[netId].numNAssigned);

			int synId = GET_CONN_SYN_ID(postInfo);
			assert(synId < (runtimeData[netId].Npre[postNId]));

			if (postNId < networkConfigs[netId].numN) // test if post-neuron is a local neuron
				generatePostSynapticSpike(lNId /* preNId */, postNId, synId, 0, netId);
		}

		k = k - 1;
	}
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperDoCurrentUpdateD1_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> doCurrentUpdateD1_CPU(args->netId);
		pthread_exit(0);
	}
#endif

// This method loops through all spikes that are generated by neurons with a delay of 2+ms
// and delivers the spikes to the appropriate post-synaptic neuron
#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::doCurrentUpdateD2_CPU(int netId) {
#else // POSIX
	void* SNN::doCurrentUpdateD2_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);

	if (networkConfigs[netId].maxDelay > 1) {
		int k = runtimeData[netId].timeTableD2[simTimeMs + 1 + networkConfigs[netId].maxDelay] - 1;
		int k_end = runtimeData[netId].timeTableD2[simTimeMs + 1];
		int t_pos = simTimeMs;

		while ((k >= k_end) && (k >= 0)) {
			// get the neuron id from the index k
			int lNId = runtimeData[netId].firingTableD2[k];

			// find the time of firing from the timeTable using index k
			while (!((k >= runtimeData[netId].timeTableD2[t_pos + networkConfigs[netId].maxDelay]) && (k < runtimeData[netId].timeTableD2[t_pos + networkConfigs[netId].maxDelay + 1]))) {
				t_pos = t_pos - 1;
				assert((t_pos + networkConfigs[netId].maxDelay - 1) >= 0);
			}

			// \TODO: Instead of using the complex timeTable, can neuronFiringTime value...???
			// Calculate the time difference between time of firing of neuron and the current time...
			int tD = simTimeMs - t_pos;

			assert((tD < networkConfigs[netId].maxDelay) && (tD >= 0));
			//assert(lNId < networkConfigs[netId].numN);

			DelayInfo dPar = runtimeData[netId].postDelayInfo[lNId * (networkConfigs[netId].maxDelay + 1) + tD];

			unsigned int offset = runtimeData[netId].cumulativePost[lNId];

			// for each delay variables
			for (int idx_d = dPar.delay_index_start; idx_d < (dPar.delay_index_start + dPar.delay_length); idx_d = idx_d + 1) {
				// get synaptic info...
				SynInfo postInfo = runtimeData[netId].postSynapticIds[offset + idx_d];

				int postNId = GET_CONN_NEURON_ID(postInfo);
				assert(postNId < networkConfigs[netId].numNAssigned);

				int synId = GET_CONN_SYN_ID(postInfo);
				assert(synId < (runtimeData[netId].Npre[postNId]));

				if (postNId < networkConfigs[netId].numN) // test if post-neuron is a local neuron
					generatePostSynapticSpike(lNId /* preNId */, postNId, synId, tD, netId);
			}

			k = k - 1;
		}
	}
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperDoCurrentUpdateD2_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> doCurrentUpdateD2_CPU(args->netId);
		pthread_exit(0);
	}
#endif

#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::doSTPUpdateAndDecayCond_CPU(int netId) {
#else // POSIX
	void* SNN::doSTPUpdateAndDecayCond_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);
	// ToDo: This can be further optimized using multiple threads allocated on mulitple CPU cores
	//decay the STP variables before adding new spikes.
	for (int lGrpId = 0; lGrpId < networkConfigs[netId].numGroups; lGrpId++) {
		for(int lNId = groupConfigs[netId][lGrpId].lStartN; lNId <= groupConfigs[netId][lGrpId].lEndN; lNId++) {
			if (groupConfigs[netId][lGrpId].WithSTP) {
				int ind_plus  = STP_BUF_POS(lNId, simTime, glbNetworkConfig.maxDelay);
				int ind_minus = STP_BUF_POS(lNId, (simTime - 1), glbNetworkConfig.maxDelay);
				runtimeData[netId].stpu[ind_plus] = runtimeData[netId].stpu[ind_minus] * (1.0f - groupConfigs[netId][lGrpId].STP_tau_u_inv);
				runtimeData[netId].stpx[ind_plus] = runtimeData[netId].stpx[ind_minus] + (1.0f - runtimeData[netId].stpx[ind_minus]) * groupConfigs[netId][lGrpId].STP_tau_x_inv;
			}

			// decay conductances
			if (networkConfigs[netId].sim_with_conductances && IS_REGULAR_NEURON(lNId, networkConfigs[netId].numNReg, networkConfigs[netId].numNPois)) {
				runtimeData[netId].gAMPA[lNId]  *= dAMPA;
				if (sim_with_NMDA_rise) {
					runtimeData[netId].gNMDA_r[lNId] *= rNMDA;	// rise
					runtimeData[netId].gNMDA_d[lNId] *= dNMDA;	// decay
				} else {
					runtimeData[netId].gNMDA[lNId]   *= dNMDA;	// instantaneous rise
				}

				runtimeData[netId].gGABAa[lNId] *= dGABAa;
				if (sim_with_GABAb_rise) {
					runtimeData[netId].gGABAb_r[lNId] *= rGABAb;	// rise
					runtimeData[netId].gGABAb_d[lNId] *= dGABAb;	// decay
				} else {
					runtimeData[netId].gGABAb[lNId] *= dGABAb;	// instantaneous rise
				}
			}
		}
	}
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperDoSTPUpdateAndDecayCond_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> doSTPUpdateAndDecayCond_CPU(args->netId);
		pthread_exit(0);
	}
#endif

#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::findFiring_CPU(int netId) {
#else // POSIX
	void* SNN::findFiring_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);
	// ToDo: This can be further optimized using multiple threads allocated on mulitple CPU cores
	for(int lGrpId = 0; lGrpId < networkConfigs[netId].numGroups; lGrpId++) {
		for (int lNId = groupConfigs[netId][lGrpId].lStartN; lNId <= groupConfigs[netId][lGrpId].lEndN; lNId++) {
			bool needToWrite = false;
			// given group of neurons belong to the poisson group....
			if (groupConfigs[netId][lGrpId].Type & POISSON_NEURON) {
				if(groupConfigs[netId][lGrpId].isSpikeGenFunc) {
					unsigned int offset = lNId - groupConfigs[netId][lGrpId].lStartN + groupConfigs[netId][lGrpId].Noffset;
					needToWrite = getSpikeGenBit(offset, netId);
				} else { // spikes generated by poission rate
					needToWrite = getPoissonSpike(lNId, netId);
				}
				// Note: valid lastSpikeTime of spike gen neurons is required by userDefinedSpikeGenerator()
				if (needToWrite)
					runtimeData[netId].lastSpikeTime[lNId] = simTime;
			} else { // Regular neuron
				if (runtimeData[netId].curSpike[lNId]) {
					runtimeData[netId].curSpike[lNId] = false;
					needToWrite = true;
				}

				// log v, u value if any active neuron monitor is presented
				if (networkConfigs[netId].sim_with_nm && lNId - groupConfigs[netId][lGrpId].lStartN < MAX_NEURON_MON_GRP_SZIE) {
					int idxBase = networkConfigs[netId].numGroups * MAX_NEURON_MON_GRP_SZIE * simTimeMs + lGrpId * MAX_NEURON_MON_GRP_SZIE;
					runtimeData[netId].nVBuffer[idxBase + lNId - groupConfigs[netId][lGrpId].lStartN] = runtimeData[netId].voltage[lNId];
					runtimeData[netId].nUBuffer[idxBase + lNId - groupConfigs[netId][lGrpId].lStartN] = runtimeData[netId].recovery[lNId];
					//KERNEL_INFO("simTimeMs: %d --base:%d -- %f -- %f --%f --%f", simTimeMs, idxBase + lNId - groupConfigs[netId][lGrpId].lStartN, runtimeData[netId].voltage[lNId], runtimeData[netId].recovery[lNId], runtimeData[netId].nVBuffer[idxBase + lNId - groupConfigs[netId][lGrpId].lStartN], runtimeData[netId].nUBuffer[idxBase + lNId - groupConfigs[netId][lGrpId].lStartN]);
				}
			}

			// his flag is set if with_stdp is set and also grpType is set to have GROUP_SYN_FIXED
			if (needToWrite) {
				bool hasSpace = false;
				int fireId = -1;

				// update spike count: spikeCountD2Sec(W), spikeCountD1Sec(W), spikeCountLastSecLeftD2(R)
				if (groupConfigs[netId][lGrpId].MaxDelay == 1)
				{
					if (runtimeData[netId].spikeCountD1Sec + 1 < networkConfigs[netId].maxSpikesD1) {
						fireId = runtimeData[netId].spikeCountD1Sec;
						runtimeData[netId].spikeCountD1Sec++;
					}
				} else { // MaxDelay > 1
					if (runtimeData[netId].spikeCountD2Sec + runtimeData[netId].spikeCountLastSecLeftD2 + 1 < networkConfigs[netId].maxSpikesD2) {
						fireId = runtimeData[netId].spikeCountD2Sec + runtimeData[netId].spikeCountLastSecLeftD2;
						runtimeData[netId].spikeCountD2Sec++;
					}
				}

				if (fireId == -1) // no space availabe in firing table, drop the spike
					continue;

				// update firing table: firingTableD1(W), firingTableD2(W)
				if (groupConfigs[netId][lGrpId].MaxDelay == 1) {
					runtimeData[netId].firingTableD1[fireId] = lNId;
				} else { // MaxDelay > 1
					runtimeData[netId].firingTableD2[fireId] = lNId;
				}

				// update external firing table: extFiringTableEndIdxD1(W), extFiringTableEndIdxD2(W), extFiringTableD1(W), extFiringTableD2(W)
				if (groupConfigs[netId][lGrpId].hasExternalConnect)     {
					int extFireId = -1;
					if (groupConfigs[netId][lGrpId].MaxDelay == 1) {
						extFireId = runtimeData[netId].extFiringTableEndIdxD1[lGrpId]++;
						runtimeData[netId].extFiringTableD1[lGrpId][extFireId] = lNId + groupConfigs[netId][lGrpId].LtoGOffset;
					} else { // MaxDelay > 1
						extFireId = runtimeData[netId].extFiringTableEndIdxD2[lGrpId]++;
						runtimeData[netId].extFiringTableD2[lGrpId][extFireId] = lNId + groupConfigs[netId][lGrpId].LtoGOffset;
					}
					assert(extFireId != -1);
				}

				// update STP for neurons that fire
				if (groupConfigs[netId][lGrpId].WithSTP) {
					firingUpdateSTP(lNId, lGrpId, netId);
				}

				// keep track of number spikes per neuron
				runtimeData[netId].nSpikeCnt[lNId]++;

				if (IS_REGULAR_NEURON(lNId, networkConfigs[netId].numNReg, networkConfigs[netId].numNPois))
					resetFiredNeuron(lNId, lGrpId, netId);

				// STDP calculation: the post-synaptic neuron fires after the arrival of a pre-synaptic spike
				if (!sim_in_testing && groupConfigs[netId][lGrpId].WithSTDP) {
					updateLTP(lNId, lGrpId, netId);
				}
			}
		}
	}
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperFindFiring_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> findFiring_CPU(args->netId);
		pthread_exit(0);
	}
#endif


// Applies the LTP side of STDP to all plastic pre-synapses of neuron lNId,
// which fires at the current simTime: for each plastic synapse the elapsed
// time since its last pre-synaptic spike selects a weight change according to
// the group's configured E-STDP / I-STDP curve (accumulated in wtChange).
void SNN::updateLTP(int lNId, int lGrpId, int netId) {
	unsigned int pos_ij = runtimeData[netId].cumulativePre[lNId]; // the index of pre-synaptic neuron
	for(int j = 0; j < runtimeData[netId].Npre_plastic[lNId]; pos_ij++, j++) {
		int stdp_tDiff = (simTime - runtimeData[netId].synSpikeTime[pos_ij]);
		// a negative diff is only legal for the MAX_SIMULATION_TIME sentinel ("never spiked yet")
		assert(!((stdp_tDiff < 0) && (runtimeData[netId].synSpikeTime[pos_ij] != MAX_SIMULATION_TIME)));

		if (stdp_tDiff > 0) {
			// the sign of maxSynWt distinguishes excitatory from inhibitory synapses
			if (groupConfigs[netId][lGrpId].WithESTDP && runtimeData[netId].maxSynWt[pos_ij] >= 0) { // excitatory synapse
				// Handle E-STDP curve
				switch (groupConfigs[netId][lGrpId].WithESTDPcurve) {
				case EXP_CURVE: // exponential curve
					// only update within 25 time constants of the pre-spike; beyond that exp(-t/tau) is negligible
					if (stdp_tDiff * groupConfigs[netId][lGrpId].TAU_PLUS_INV_EXC < 25)
						runtimeData[netId].wtChange[pos_ij] += STDP(stdp_tDiff, groupConfigs[netId][lGrpId].ALPHA_PLUS_EXC, groupConfigs[netId][lGrpId].TAU_PLUS_INV_EXC);
					break;
				case TIMING_BASED_CURVE: // sc curve
					if (stdp_tDiff * groupConfigs[netId][lGrpId].TAU_PLUS_INV_EXC < 25) {
						// potentiate inside the GAMMA window, depress outside of it
						if (stdp_tDiff <= groupConfigs[netId][lGrpId].GAMMA)
							runtimeData[netId].wtChange[pos_ij] += groupConfigs[netId][lGrpId].OMEGA + groupConfigs[netId][lGrpId].KAPPA * STDP(stdp_tDiff, groupConfigs[netId][lGrpId].ALPHA_PLUS_EXC, groupConfigs[netId][lGrpId].TAU_PLUS_INV_EXC);
						else // stdp_tDiff > GAMMA
							runtimeData[netId].wtChange[pos_ij] -= STDP(stdp_tDiff, groupConfigs[netId][lGrpId].ALPHA_PLUS_EXC, groupConfigs[netId][lGrpId].TAU_PLUS_INV_EXC);
					}
					break;
				default:
					KERNEL_ERROR("Invalid E-STDP curve!");
					break;
				}
			} else if (groupConfigs[netId][lGrpId].WithISTDP && runtimeData[netId].maxSynWt[pos_ij] < 0) { // inhibitory synapse
				// Handle I-STDP curve
				switch (groupConfigs[netId][lGrpId].WithISTDPcurve) {
				case EXP_CURVE: // exponential curve
					if (stdp_tDiff * groupConfigs[netId][lGrpId].TAU_PLUS_INV_INB < 25) { // LTP of inhibitory synapse, which decreases synapse weight
						runtimeData[netId].wtChange[pos_ij] -= STDP(stdp_tDiff, groupConfigs[netId][lGrpId].ALPHA_PLUS_INB, groupConfigs[netId][lGrpId].TAU_PLUS_INV_INB);
					}
					break;
				case PULSE_CURVE: // pulse curve
					if (stdp_tDiff <= groupConfigs[netId][lGrpId].LAMBDA) { // LTP of inhibitory synapse, which decreases synapse weight
						runtimeData[netId].wtChange[pos_ij] -= groupConfigs[netId][lGrpId].BETA_LTP;
					} else if (stdp_tDiff <= groupConfigs[netId][lGrpId].DELTA) { // LTD of inhibitory synapse, which increases synapse weight
						runtimeData[netId].wtChange[pos_ij] -= groupConfigs[netId][lGrpId].BETA_LTD;
					} else { /*do nothing*/}
					break;
				default:
					KERNEL_ERROR("Invalid I-STDP curve!");
					break;
				}
			}
		}
	}
}

void SNN::firingUpdateSTP(int lNId, int lGrpId, int netId) {
	// update the spike-dependent part of du/dt and dx/dt
	// we need to retrieve the STP values from the right buffer position (right before vs. right after the spike)
	int ind_plus = STP_BUF_POS(lNId, simTime, networkConfigs[netId].maxDelay); // index of right after the spike, such as in u^+
	int ind_minus = STP_BUF_POS(lNId, (simTime - 1), networkConfigs[netId].maxDelay); // index of right before the spike, such as in u^-

	// du/dt = -u/tau_F + U * (1-u^-) * \delta(t-t_{spk})
	runtimeData[netId].stpu[ind_plus] += groupConfigs[netId][lGrpId].STP_U * (1.0f - runtimeData[netId].stpu[ind_minus]);

	// dx/dt = (1-x)/tau_D - u^+ * x^- * \delta(t-t_{spk})
	runtimeData[netId].stpx[ind_plus] -= runtimeData[netId].stpu[ind_plus] * runtimeData[netId].stpx[ind_minus];
}

void SNN::resetFiredNeuron(int lNId, short int lGrpId, int netId) {
	// Post-spike bookkeeping for a neuron that fired at the current timestep.

	// STDP uses each neuron's most recent spike time as the post-synaptic
	// reference when later pre-synaptic spikes arrive.
	if (groupConfigs[netId][lGrpId].WithSTDP)
		runtimeData[netId].lastSpikeTime[lNId] = simTime;

	if (networkConfigs[netId].sim_with_homeostasis) {
		// with homeostasis flag can be used here.
		// Bump the running firing-rate estimate by one spike's contribution;
		// note that 1000 / (avgTimeScale * 1000) reduces to 1 / avgTimeScale
		// (expression kept as-is to preserve exact float behavior).
		runtimeData[netId].avgFiring[lNId] += 1000 / (groupConfigs[netId][lGrpId].avgTimeScale * 1000);
	}
}

bool SNN::getPoissonSpike(int lNId, int netId) {
	// Poisson neurons are stored after the numNReg regular neurons, but the
	// randNum / poissonFireRate arrays are indexed from 0 — rebase the id once.
	const int poissonIdx = lNId - networkConfigs[netId].numNReg;

	// Fire iff the random draw (scaled by 1000) falls below this neuron's
	// Poisson firing probability; a probability of 1.0 therefore always fires.
	return runtimeData[netId].randNum[poissonIdx] * 1000.0f
		< runtimeData[netId].poissonFireRate[poissonIdx];
}

bool SNN::getSpikeGenBit(unsigned int nIdPos, int netId) {
	// spikeGenBits is a packed bit vector: 32 neuron positions per 32-bit word.
	const unsigned int word = nIdPos / 32u;
	const unsigned int bit = nIdPos % 32u;
	return (runtimeData[netId].spikeGenBits[word] & (0x1u << bit)) != 0;
}

/*
* The sequence of handling a post-synaptic spike in CPU mode:
* P1. Load wt into change (temporary variable)
* P2. Modulate change by STP (if enabled)
* P3-1. Modulate change by d_mulSynSlow and d_mulSynFast
* P3-2. Accumulate g(AMPA,NMDA,GABAa,GABAb) or current
* P4. Update synSpikeTime
* P5. Update DA,5HT,ACh,NE accordingly
* P6. Update STDP wtChange
* P7. Update v(voltage), u(recovery)
* P8. Update homeostasis
* P9. Decay and log DA,5HT,ACh,NE
*/
// Delivers one pre-synaptic spike arriving (after axonal delay tD) at synapse
// synId of neuron postNId in local network netId: updates conductances/current,
// the synapse's spike time, neuromodulator levels, and the pre-after-post side
// of STDP. Steps P1-P6 refer to the sequence documented above.
void SNN::generatePostSynapticSpike(int preNId, int postNId, int synId, int tD, int netId) {
	// get the cumulative position for quick access
	unsigned int pos = runtimeData[netId].cumulativePre[postNId] + synId;
	assert(postNId < networkConfigs[netId].numNReg); // \FIXME is this assert supposed to be for pos?

	// get group id of pre- / post-neuron
	short int post_grpId = runtimeData[netId].grpIds[postNId];
	short int pre_grpId = runtimeData[netId].grpIds[preNId];

	unsigned int pre_type = groupConfigs[netId][pre_grpId].Type;

	// get connect info from the cumulative synapse index for mulSynFast/mulSynSlow (requires less memory than storing
	// mulSynFast/Slow per synapse or storing a pointer to grpConnectInfo_s)
	// mulSynFast will be applied to fast currents (either AMPA or GABAa)
	// mulSynSlow will be applied to slow currents (either NMDA or GABAb)
	short int mulIndex = runtimeData[netId].connIdsPreIdx[pos];
	assert(mulIndex >= 0 && mulIndex < numConnections);

	// P1
	// for each presynaptic spike, postsynaptic (synaptic) current is going to increase by some amplitude (change)
	// generally speaking, this amplitude is the weight; but it can be modulated by STP
	float change = runtimeData[netId].wt[pos];

	// P2
	if (groupConfigs[netId][pre_grpId].WithSTP) {
		// if pre-group has STP enabled, we need to modulate the weight
		// NOTE: Order is important! (Tsodyks & Markram, 1998; Mongillo, Barak, & Tsodyks, 2008)
		// use u^+ (value right after spike-update) but x^- (value right before spike-update)

		// dI/dt = -I/tau_S + A * u^+ * x^- * \delta(t-t_{spk})
		// I noticed that for connect(.., RangeDelay(1), ..) tD will be 0
		int ind_minus = STP_BUF_POS(preNId, (simTime-tD-1), networkConfigs[netId].maxDelay);
		int ind_plus  = STP_BUF_POS(preNId, (simTime-tD), networkConfigs[netId].maxDelay);

		change *= groupConfigs[netId][pre_grpId].STP_A * runtimeData[netId].stpu[ind_plus] * runtimeData[netId].stpx[ind_minus];

		//printf("%d: %d[%d], numN=%d, td=%d, maxDelay_=%d, ind-=%d, ind+=%d, stpu=[%f,%f], stpx=[%f,%f], change=%f, wt=%f\n",
		//	simTime, pre_grpId, preNId,
		//	groupConfigs[netId][pre_grpId].numN, tD, networkConfigs[netId].maxDelay, ind_minus, ind_plus,
		//	runtimeData[netId].stpu[ind_minus], runtimeData[netId].stpu[ind_plus],
		//	runtimeData[netId].stpx[ind_minus], runtimeData[netId].stpx[ind_plus],
		//	change, runtimeData[netId].wt[pos]);
	}

	// P3-1, P3-2
	// update currents
	// NOTE: it's faster to += 0.0 rather than checking for zero and not updating
	if (sim_with_conductances) {
		// COBA mode: accumulate per-receptor conductances, scaled by the
		// connection-level fast/slow multipliers.
		if (pre_type & TARGET_AMPA) // if postNId expresses AMPAR
			runtimeData[netId].gAMPA [postNId] += change * mulSynFast[mulIndex]; // scale by some factor
		if (pre_type & TARGET_NMDA) {
			if (sim_with_NMDA_rise) {
				runtimeData[netId].gNMDA_r[postNId] += change * sNMDA * mulSynSlow[mulIndex];
				runtimeData[netId].gNMDA_d[postNId] += change * sNMDA * mulSynSlow[mulIndex];
			} else {
				runtimeData[netId].gNMDA [postNId] += change * mulSynSlow[mulIndex];
			}
		}
		if (pre_type & TARGET_GABAa)
			runtimeData[netId].gGABAa[postNId] -= change * mulSynFast[mulIndex]; // wt should be negative for GABAa and GABAb
		if (pre_type & TARGET_GABAb) {
			if (sim_with_GABAb_rise) {
				runtimeData[netId].gGABAb_r[postNId] -= change * sGABAb * mulSynSlow[mulIndex];
				runtimeData[netId].gGABAb_d[postNId] -= change * sGABAb * mulSynSlow[mulIndex];
			} else {
				runtimeData[netId].gGABAb[postNId] -= change * mulSynSlow[mulIndex];
			}
		}
	} else {
		// CUBA mode: weight contributes directly to the input current.
		runtimeData[netId].current[postNId] += change;
	}

	// P4
	runtimeData[netId].synSpikeTime[pos] = simTime;

	// P5
	// Got one spike from dopaminergic neuron, increase dopamine concentration in the target area
	// NOTE(review): the DA increment 0.04 is a hardcoded magic constant — confirm
	// it matches the GPU backend and is the intended tuning value.
	if (pre_type & TARGET_DA) {
		runtimeData[netId].grpDA[post_grpId] += 0.04;
	}

	// P6
	// STDP calculation: the post-synaptic neuron fires before the arrival of a pre-synaptic spike
	// (i.e., this branch handles the LTD side: stdp_tDiff = time since the post neuron last fired)
	if (!sim_in_testing && groupConfigs[netId][post_grpId].WithSTDP) {
		int stdp_tDiff = (simTime - runtimeData[netId].lastSpikeTime[postNId]);

		if (stdp_tDiff >= 0) {
			if (groupConfigs[netId][post_grpId].WithISTDP && ((pre_type & TARGET_GABAa) || (pre_type & TARGET_GABAb))) { // inhibitory syanpse
				// Handle I-STDP curve
				switch (groupConfigs[netId][post_grpId].WithISTDPcurve) {
				case EXP_CURVE: // exponential curve
					// the < 25 guard skips the update once the exponential has decayed to ~0
					if (stdp_tDiff * groupConfigs[netId][post_grpId].TAU_MINUS_INV_INB < 25) { // LTD of inhibitory syanpse, which increase synapse weight
						runtimeData[netId].wtChange[pos] -= STDP(stdp_tDiff, groupConfigs[netId][post_grpId].ALPHA_MINUS_INB, groupConfigs[netId][post_grpId].TAU_MINUS_INV_INB);
					}
					break;
				case PULSE_CURVE: // pulse curve
					if (stdp_tDiff <= groupConfigs[netId][post_grpId].LAMBDA) { // LTP of inhibitory synapse, which decreases synapse weight
						runtimeData[netId].wtChange[pos] -= groupConfigs[netId][post_grpId].BETA_LTP;
					} else if (stdp_tDiff <= groupConfigs[netId][post_grpId].DELTA) { // LTD of inhibitory syanpse, which increase synapse weight
						runtimeData[netId].wtChange[pos] -= groupConfigs[netId][post_grpId].BETA_LTD;
					} else { /*do nothing*/ }
					break;
				default:
					KERNEL_ERROR("Invalid I-STDP curve");
					break;
				}
			} else if (groupConfigs[netId][post_grpId].WithESTDP && ((pre_type & TARGET_AMPA) || (pre_type & TARGET_NMDA))) { // excitatory synapse
				// Handle E-STDP curve
				switch (groupConfigs[netId][post_grpId].WithESTDPcurve) {
				case EXP_CURVE: // exponential curve
				case TIMING_BASED_CURVE: // sc curve
					if (stdp_tDiff * groupConfigs[netId][post_grpId].TAU_MINUS_INV_EXC < 25)
						runtimeData[netId].wtChange[pos] += STDP(stdp_tDiff, groupConfigs[netId][post_grpId].ALPHA_MINUS_EXC, groupConfigs[netId][post_grpId].TAU_MINUS_INV_EXC);
					break;
				default:
					KERNEL_ERROR("Invalid E-STDP curve");
					break;
				}
			} else { /*do nothing*/ }
		}
		// a negative tDiff may only occur if the post neuron has never fired
		// (lastSpikeTime still at its MAX_SIMULATION_TIME sentinel)
		assert(!((stdp_tDiff < 0) && (runtimeData[netId].lastSpikeTime[postNId] != MAX_SIMULATION_TIME)));
	}
}

// single integration step for voltage equation of 4-param Izhikevich
inline
float dvdtIzhikevich4(float volt, float recov, float totalCurrent, float timeStep = 1.0f) {
	// dv/dt = 0.04*v^2 + 5*v + 140 - u + I (Horner form preserved), scaled by timeStep
	const float dv = (0.04f * volt + 5.0f) * volt + 140.0f - recov + totalCurrent;
	return dv * timeStep;
}

// single integration step for recovery equation of 4-param Izhikevich
inline
float dudtIzhikevich4(float volt, float recov, float izhA, float izhB, float timeStep = 1.0f) {
	// du/dt = a * (b*v - u), scaled by timeStep
	const float drive = izhB * volt - recov;
	return izhA * drive * timeStep;
}

// single integration step for voltage equation of 9-param Izhikevich
inline
float dvdtIzhikevich9(float volt, float recov, float invCapac, float izhK, float voltRest,
	float voltInst, float totalCurrent, float timeStep = 1.0f)
{
	// C * dv/dt = k*(v - vr)*(v - vt) - u + I  =>  dv = numer * (1/C) * dt
	const float numer = izhK * (volt - voltRest) * (volt - voltInst) - recov + totalCurrent;
	return numer * invCapac * timeStep;
}

// single integration step for recovery equation of 9-param Izhikevich
inline
float dudtIzhikevich9(float volt, float recov, float voltRest, float izhA, float izhB, float timeStep = 1.0f) {
	// du/dt = a * (b*(v - vr) - u), scaled by timeStep
	const float drive = izhB * (volt - voltRest) - recov;
	return izhA * drive * timeStep;
}

// single integration step for voltage equation of LIF neurons
// tau_m * dv/dt = (vReset - v) + (gain * I + bias); forward-Euler step of size timeStep
inline
float dvdtLIF(float volt, float lif_vReset, float lif_gain, float lif_bias, int lif_tau_m, float totalCurrent, float timeStep = 1.0f) {
	// static_cast replaces the original C-style cast; tau_m is an integer time constant
	return ((lif_vReset - volt + ((totalCurrent * lif_gain) + lif_bias)) / static_cast<float>(lif_tau_m)) * timeStep;
}

float SNN::getCompCurrent(int netid, int lGrpId, int lneurId, float const0, float const1) {
	// Sum the coupling currents flowing into lneurId from every compartmentally
	// connected neighbor group. Compartment connections are one-to-one: the i-th
	// neuron in lGrpId couples to the i-th neuron in each neighbor group.
	float total = 0.0f;
	for (int nb = 0; nb < groupConfigs[netid][lGrpId].numCompNeighbors; nb++) {
		const int lGrpIdOther = groupConfigs[netid][lGrpId].compNeighbors[nb];
		// translate this neuron's offset within lGrpId into the neighbor group
		const int lneurIdOther = lneurId - groupConfigs[netid][lGrpId].lStartN + groupConfigs[netid][lGrpIdOther].lStartN;
		const float vOther = runtimeData[netid].voltage[lneurIdOther] + const1;
		const float vSelf = runtimeData[netid].voltage[lneurId] + const0;
		total += groupConfigs[netid][lGrpId].compCoupling[nb] * (vOther - vSelf);
	}

	return total;
}

// Main neuron-state integration routine (CPU backend): advances v and u for all
// regular neurons of local network netId by one full millisecond, subdivided
// into simNumStepsPerMs sub-steps of size timeStep (forward Euler or RK4).
#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void  SNN::globalStateUpdate_CPU(int netId) {
#else // POSIX
	void*  SNN::globalStateUpdate_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);

	float timeStep = networkConfigs[netId].timeStep;

	// loop that allows smaller integration time step for v's and u's
	for (int j = 1; j <= networkConfigs[netId].simNumStepsPerMs; j++) {
		bool lastIter = (j == networkConfigs[netId].simNumStepsPerMs);
		for (int lGrpId = 0; lGrpId < networkConfigs[netId].numGroups; lGrpId++) {
			if (groupConfigs[netId][lGrpId].Type & POISSON_NEURON) {
				// Poisson groups have no membrane dynamics; only decay their
				// homeostatic firing average once per ms.
				// NOTE(review): bitwise '&' between two bools — behaviorally
				// equivalent to '&&' here, but '&&' was likely intended.
				if (groupConfigs[netId][lGrpId].WithHomeostasis & (lastIter)) {
					for (int lNId = groupConfigs[netId][lGrpId].lStartN; lNId <= groupConfigs[netId][lGrpId].lEndN; lNId++)
						runtimeData[netId].avgFiring[lNId] *= groupConfigs[netId][lGrpId].avgTimeScale_decay;
				}
				continue;
			}

			for (int lNId = groupConfigs[netId][lGrpId].lStartN; lNId <= groupConfigs[netId][lGrpId].lEndN; lNId++) {
				assert(lNId < networkConfigs[netId].numNReg);

				// P7
				// update conductances
				float v = runtimeData[netId].voltage[lNId];
				float v_next = runtimeData[netId].nextVoltage[lNId];
				float u = runtimeData[netId].recovery[lNId];
				float I_sum, NMDAtmp;
				float gNMDA, gGABAb;

				// pre-load izhikevich variables to avoid unnecessary memory accesses & unclutter the code.
				float k = runtimeData[netId].Izh_k[lNId];
				float vr = runtimeData[netId].Izh_vr[lNId];
				float vt = runtimeData[netId].Izh_vt[lNId];
				float inverse_C = 1.0f / runtimeData[netId].Izh_C[lNId];
				float vpeak = runtimeData[netId].Izh_vpeak[lNId];
				float a = runtimeData[netId].Izh_a[lNId];
				float b = runtimeData[netId].Izh_b[lNId];

				// pre-load LIF parameters
				int lif_tau_m = runtimeData[netId].lif_tau_m[lNId];
				int lif_tau_ref = runtimeData[netId].lif_tau_ref[lNId];
				int lif_tau_ref_c = runtimeData[netId].lif_tau_ref_c[lNId];
				float lif_vTh = runtimeData[netId].lif_vTh[lNId];
				float lif_vReset = runtimeData[netId].lif_vReset[lNId];
				float lif_gain = runtimeData[netId].lif_gain[lNId];
				float lif_bias = runtimeData[netId].lif_bias[lNId];

				float totalCurrent = runtimeData[netId].extCurrent[lNId];

				if (networkConfigs[netId].sim_with_conductances) {
					// COBA: synaptic current from per-receptor conductances with
					// fixed reversal potentials (0 mV exc, -70/-90 mV inh) and the
					// Izhikevich-style NMDA voltage dependence.
					NMDAtmp = (v + 80.0f) * (v + 80.0f) / 60.0f / 60.0f;
					gNMDA = (networkConfigs[netId].sim_with_NMDA_rise) ? (runtimeData[netId].gNMDA_d[lNId] - runtimeData[netId].gNMDA_r[lNId]) : runtimeData[netId].gNMDA[lNId];
					gGABAb = (networkConfigs[netId].sim_with_GABAb_rise) ? (runtimeData[netId].gGABAb_d[lNId] - runtimeData[netId].gGABAb_r[lNId]) : runtimeData[netId].gGABAb[lNId];

					I_sum = -(runtimeData[netId].gAMPA[lNId] * (v - 0.0f)
						+ gNMDA * NMDAtmp / (1.0f + NMDAtmp) * (v - 0.0f)
						+ runtimeData[netId].gGABAa[lNId] * (v + 70.0f)
						+ gGABAb * (v + 90.0f));

					totalCurrent += I_sum;
				}
				else {
					// CUBA: synaptic current was accumulated directly
					totalCurrent += runtimeData[netId].current[lNId];
				}
				if (groupConfigs[netId][lGrpId].withCompartments) {
					totalCurrent += getCompCurrent(netId, lGrpId, lNId);
				}

				switch (networkConfigs[netId].simIntegrationMethod) {
				case FORWARD_EULER:
					if (!groupConfigs[netId][lGrpId].withParamModel_9 && !groupConfigs[netId][lGrpId].isLIF)
					{
						// update vpos and upos for the current neuron
						v_next = v + dvdtIzhikevich4(v, u, totalCurrent, timeStep);
						if (v_next > 30.0f) {
							// NOTE(review): the 30.0f clamp is a dead store — it is
							// immediately overwritten by Izh_c below (same pattern in
							// the 9-param and RK4 branches); presumably kept for
							// parity with the GPU kernel — confirm.
							v_next = 30.0f; // break the loop but evaluate u[i]
							runtimeData[netId].curSpike[lNId] = true;
							v_next = runtimeData[netId].Izh_c[lNId];
							u += runtimeData[netId].Izh_d[lNId];
						}
					}
					else if (!groupConfigs[netId][lGrpId].isLIF)
					{
						// update vpos and upos for the current neuron
						v_next = v + dvdtIzhikevich9(v, u, inverse_C, k, vr, vt, totalCurrent, timeStep);
						if (v_next > vpeak) {
							v_next = vpeak; // break the loop but evaluate u[i]
							runtimeData[netId].curSpike[lNId] = true;
							v_next = runtimeData[netId].Izh_c[lNId];
							u += runtimeData[netId].Izh_d[lNId];
						}
					}

					else{
						// LIF: hold at vReset while refractory; the counter is only
						// decremented once per ms (on the last sub-step)
						if (lif_tau_ref_c > 0){
							if(lastIter){
								runtimeData[netId].lif_tau_ref_c[lNId] -= 1;
								v_next = lif_vReset;
							}
						}
						else{
							if (v_next > lif_vTh) {
								runtimeData[netId].curSpike[lNId] = true;
								v_next = lif_vReset;

								// +1 compensates for the decrement that will happen on
								// this ms's final sub-step
								if(lastIter){
                                        				runtimeData[netId].lif_tau_ref_c[lNId] = lif_tau_ref;
								}
								else{
									runtimeData[netId].lif_tau_ref_c[lNId] = lif_tau_ref + 1;
								}
							}
							else{
								v_next = v + dvdtLIF(v, lif_vReset, lif_gain, lif_bias, lif_tau_m, totalCurrent, timeStep);
							}
						}
					}

					// clamp voltage from below and advance the recovery variable
					if (groupConfigs[netId][lGrpId].isLIF){
						if (v_next < lif_vReset) v_next = lif_vReset;
					}
					else{
						if (v_next < -90.0f) v_next = -90.0f;

						if (!groupConfigs[netId][lGrpId].withParamModel_9)
						{
							u += dudtIzhikevich4(v_next, u, a, b, timeStep);
						}
						else
						{
							u += dudtIzhikevich9(v_next, u, vr, a, b, timeStep);
						}
					}
					break;

				case RUNGE_KUTTA4:

					if (!groupConfigs[netId][lGrpId].withParamModel_9 && !groupConfigs[netId][lGrpId].isLIF) {
						// 4-param Izhikevich
						float k1 = dvdtIzhikevich4(v, u, totalCurrent, timeStep);
						float l1 = dudtIzhikevich4(v, u, a, b, timeStep);

						float k2 = dvdtIzhikevich4(v + k1 / 2.0f, u + l1 / 2.0f, totalCurrent,
							timeStep);
						float l2 = dudtIzhikevich4(v + k1 / 2.0f, u + l1 / 2.0f, a, b, timeStep);

						float k3 = dvdtIzhikevich4(v + k2 / 2.0f, u + l2 / 2.0f, totalCurrent,
							timeStep);
						float l3 = dudtIzhikevich4(v + k2 / 2.0f, u + l2 / 2.0f, a, b, timeStep);

						float k4 = dvdtIzhikevich4(v + k3, u + l3, totalCurrent, timeStep);
						float l4 = dudtIzhikevich4(v + k3, u + l3, a, b, timeStep);
						v_next = v + (1.0f / 6.0f) * (k1 + 2.0f * k2 + 2.0f * k3 + k4);
						if (v_next > 30.0f) {
							v_next = 30.0f;
							runtimeData[netId].curSpike[lNId] = true;
							v_next = runtimeData[netId].Izh_c[lNId];
							u += runtimeData[netId].Izh_d[lNId];
						}
						if (v_next < -90.0f) v_next = -90.0f;

						u += (1.0f / 6.0f) * (l1 + 2.0f * l2 + 2.0f * l3 + l4);
					}
					else if(!groupConfigs[netId][lGrpId].isLIF){
						// 9-param Izhikevich
						float k1 = dvdtIzhikevich9(v, u, inverse_C, k, vr, vt, totalCurrent,
							timeStep);
						float l1 = dudtIzhikevich9(v, u, vr, a, b, timeStep);

						float k2 = dvdtIzhikevich9(v + k1 / 2.0f, u + l1 / 2.0f, inverse_C, k, vr, vt,
							totalCurrent, timeStep);
						float l2 = dudtIzhikevich9(v + k1 / 2.0f, u + l1 / 2.0f, vr, a, b, timeStep);

						float k3 = dvdtIzhikevich9(v + k2 / 2.0f, u + l2 / 2.0f, inverse_C, k, vr, vt,
							totalCurrent, timeStep);
						float l3 = dudtIzhikevich9(v + k2 / 2.0f, u + l2 / 2.0f, vr, a, b, timeStep);

						float k4 = dvdtIzhikevich9(v + k3, u + l3, inverse_C, k, vr, vt,
							totalCurrent, timeStep);
						float l4 = dudtIzhikevich9(v + k3, u + l3, vr, a, b, timeStep);

						v_next = v + (1.0f / 6.0f) * (k1 + 2.0f * k2 + 2.0f * k3 + k4);

						if (v_next > vpeak) {
							v_next = vpeak; // break the loop but evaluate u[i]
							runtimeData[netId].curSpike[lNId] = true;
							v_next = runtimeData[netId].Izh_c[lNId];
							u += runtimeData[netId].Izh_d[lNId];
						}

						if (v_next < -90.0f) v_next = -90.0f;

						u += (1.0f / 6.0f) * (l1 + 2.0f * l2 + 2.0f * l3 + l4);
					}
					else{
						//LIF integration is always FORWARD_EULER
						if (lif_tau_ref_c > 0){
							if(lastIter){
								runtimeData[netId].lif_tau_ref_c[lNId] -= 1;
								v_next = lif_vReset;
							}
						}
						else{
							if (v_next > lif_vTh) {
								runtimeData[netId].curSpike[lNId] = true;
								v_next = lif_vReset;

								if(lastIter){
                                        				runtimeData[netId].lif_tau_ref_c[lNId] = lif_tau_ref;
								}
								else{
									runtimeData[netId].lif_tau_ref_c[lNId] = lif_tau_ref + 1;
								}
							}
							else{
								v_next = v + dvdtLIF(v, lif_vReset, lif_gain, lif_bias, lif_tau_m, totalCurrent, timeStep);
							}
						}
						if (v_next < lif_vReset) v_next = lif_vReset;
					}
					break;
				case UNKNOWN_INTEGRATION:
				default:
					// unreachable for a validly configured network
					exitSimulation(1);
				}

				runtimeData[netId].nextVoltage[lNId] = v_next;
				runtimeData[netId].recovery[lNId] = u;

				// update current & average firing rate for homeostasis once per globalStateUpdate_CPU call
				if (lastIter)
				{
					if (networkConfigs[netId].sim_with_conductances) {
						runtimeData[netId].current[lNId] = I_sum;
					}
					else {
						// current must be reset here for CUBA and not STPUpdateAndDecayConductances
						runtimeData[netId].current[lNId] = 0.0f;
					}

					// P8
					// update average firing rate for homeostasis
					if (groupConfigs[netId][lGrpId].WithHomeostasis)
						runtimeData[netId].avgFiring[lNId] *= groupConfigs[netId][lGrpId].avgTimeScale_decay;

					// log i value if any active neuron monitor is presented
					if (networkConfigs[netId].sim_with_nm && lNId - groupConfigs[netId][lGrpId].lStartN < MAX_NEURON_MON_GRP_SZIE) {
						int idxBase = networkConfigs[netId].numGroups * MAX_NEURON_MON_GRP_SZIE * simTimeMs + lGrpId * MAX_NEURON_MON_GRP_SZIE;
						runtimeData[netId].nIBuffer[idxBase + lNId - groupConfigs[netId][lGrpId].lStartN] = totalCurrent;
					}
				}
			} // end StartN...EndN

			  // decay dopamine concentration once per globalStateUpdate_CPU call
			if (lastIter)
			{
				// P9
				// decay dopamine concentration
				// NOTE(review): the second operand compares WithISTDP (a flag) with
				// DA_MOD; WithISTDPtype was probably intended — confirm against GPU kernel.
				if ((groupConfigs[netId][lGrpId].WithESTDPtype == DA_MOD || groupConfigs[netId][lGrpId].WithISTDP == DA_MOD) && runtimeData[netId].grpDA[lGrpId] > groupConfigs[netId][lGrpId].baseDP) {
					runtimeData[netId].grpDA[lGrpId] *= groupConfigs[netId][lGrpId].decayDP;
				}
				runtimeData[netId].grpDABuffer[lGrpId * 1000 + simTimeMs] = runtimeData[netId].grpDA[lGrpId];
			}
		} // end numGroups

		  // Only after we are done computing nextVoltage for all neurons do we copy the new values to the voltage array.
		  // This is crucial for GPU (asynchronous kernel launch) and in the future for a multi-threaded CARLsim version.

		memcpy(runtimeData[netId].voltage, runtimeData[netId].nextVoltage, sizeof(float)*networkConfigs[netId].numNReg);

	} // end simNumStepsPerMs loop

	// NOTE(review): in the POSIX build this function is declared void* but falls
	// off the end without a return — confirm the pthread helper ignores the value.
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperGlobalStateUpdate_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> globalStateUpdate_CPU(args->netId);
		pthread_exit(0);
	}
#endif

// This function updates the synaptic weights from its derivatives..
// Runs once per weight-update period: folds the accumulated wtChange derivative
// (optionally scaled by homeostasis and/or dopamine) into each plastic weight,
// decays wtChange, and clips weights to [0, maxSynWt] (or [maxSynWt, 0] for
// inhibitory synapses, whose weights are negative).
#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::updateWeights_CPU(int netId) {
#else // POSIX
	void* SNN::updateWeights_CPU(int netId) {
#endif
	// at this point we have already checked for sim_in_testing and sim_with_fixedwts
	assert(sim_in_testing==false);
	assert(sim_with_fixedwts==false);
	assert(runtimeData[netId].memType == CPU_MEM);

	// update synaptic weights here for all the neurons..
	for (int lGrpId = 0; lGrpId < networkConfigs[netId].numGroups; lGrpId++) {
		// no changable weights so continue without changing..
		if (groupConfigs[netId][lGrpId].FixedInputWts || !(groupConfigs[netId][lGrpId].WithSTDP))
			continue;

		for (int lNId = groupConfigs[netId][lGrpId].lStartN; lNId <= groupConfigs[netId][lGrpId].lEndN; lNId++) {
			assert(lNId < networkConfigs[netId].numNReg);
			unsigned int offset = runtimeData[netId].cumulativePre[lNId];
			float diff_firing = 0.0;
			float homeostasisScale = 1.0;

			if (groupConfigs[netId][lGrpId].WithHomeostasis) {
				assert(runtimeData[netId].baseFiring[lNId] > 0);
				// relative deviation from the target (base) firing rate; positive
				// when the neuron fires below target
				diff_firing = 1 - runtimeData[netId].avgFiring[lNId] / runtimeData[netId].baseFiring[lNId];
				homeostasisScale = groupConfigs[netId][lGrpId].homeostasisScale;
			}

			if (lNId == groupConfigs[netId][lGrpId].lStartN)
				KERNEL_DEBUG("Weights, Change at %d (diff_firing: %f)", simTimeSec, diff_firing);

			// only the first Npre_plastic synapses of each neuron are plastic
			for (int j = 0; j < runtimeData[netId].Npre_plastic[lNId]; j++) {
				//	if (i==groupConfigs[0][g].StartN)
				//		KERNEL_DEBUG("%1.2f %1.2f \t", wt[offset+j]*10, wtChange[offset+j]*10);
				float effectiveWtChange = stdpScaleFactor_ * runtimeData[netId].wtChange[offset + j];
				//				if (wtChange[offset+j])
				//					printf("connId=%d, wtChange[%d]=%f\n",connIdsPreIdx[offset+j],offset+j,wtChange[offset+j]);

								// homeostatic weight update
								// FIXME: check WithESTDPtype and WithISTDPtype first and then do weight change update
				switch (groupConfigs[netId][lGrpId].WithESTDPtype) {
				case STANDARD:
					if (groupConfigs[netId][lGrpId].WithHomeostasis) {
						// NOTE(review): this path uses the raw wtChange (not the
						// stdpScaleFactor_-scaled effectiveWtChange) — confirm intended.
						runtimeData[netId].wt[offset + j] += (diff_firing*runtimeData[netId].wt[offset + j] * homeostasisScale + runtimeData[netId].wtChange[offset + j])*runtimeData[netId].baseFiring[lNId] / groupConfigs[netId][lGrpId].avgTimeScale / (1 + fabs(diff_firing) * 50);
					} else {
						// just STDP weight update
						runtimeData[netId].wt[offset + j] += effectiveWtChange;
					}
					break;
				case DA_MOD:
					if (groupConfigs[netId][lGrpId].WithHomeostasis) {
						// NOTE(review): effectiveWtChange is overwritten here and then
						// reused by the WithISTDPtype switch below — confirm intended.
						effectiveWtChange = runtimeData[netId].grpDA[lGrpId] * effectiveWtChange;
						runtimeData[netId].wt[offset + j] += (diff_firing*runtimeData[netId].wt[offset + j] * homeostasisScale + effectiveWtChange)*runtimeData[netId].baseFiring[lNId] / groupConfigs[netId][lGrpId].avgTimeScale / (1 + fabs(diff_firing) * 50);
					} else {
						runtimeData[netId].wt[offset + j] += runtimeData[netId].grpDA[lGrpId] * effectiveWtChange;
					}
					break;
				case UNKNOWN_STDP:
				default:
					// we shouldn't even be in here if !WithSTDP
					break;
				}

				switch (groupConfigs[netId][lGrpId].WithISTDPtype) {
				case STANDARD:
					if (groupConfigs[netId][lGrpId].WithHomeostasis) {
						runtimeData[netId].wt[offset + j] += (diff_firing*runtimeData[netId].wt[offset + j] * homeostasisScale + runtimeData[netId].wtChange[offset + j])*runtimeData[netId].baseFiring[lNId] / groupConfigs[netId][lGrpId].avgTimeScale / (1 + fabs(diff_firing) * 50);
					} else {
						// just STDP weight update
						runtimeData[netId].wt[offset + j] += effectiveWtChange;
					}
					break;
				case DA_MOD:
					if (groupConfigs[netId][lGrpId].WithHomeostasis) {
						effectiveWtChange = runtimeData[netId].grpDA[lGrpId] * effectiveWtChange;
						runtimeData[netId].wt[offset + j] += (diff_firing*runtimeData[netId].wt[offset + j] * homeostasisScale + effectiveWtChange)*runtimeData[netId].baseFiring[lNId] / groupConfigs[netId][lGrpId].avgTimeScale / (1 + fabs(diff_firing) * 50);
					} else {
						runtimeData[netId].wt[offset + j] += runtimeData[netId].grpDA[lGrpId] * effectiveWtChange;
					}
					break;
				case UNKNOWN_STDP:
				default:
					// we shouldn't even be in here if !WithSTDP
					break;
				}

				// It is users' choice to decay weight change or not
				// see setWeightAndWeightChangeUpdate()
				runtimeData[netId].wtChange[offset + j] *= wtChangeDecay_;

				// if this is an excitatory or inhibitory synapse
				// (sign of maxSynWt encodes the synapse type: >= 0 excitatory,
				//  < 0 inhibitory; clip into the corresponding range)
				if (runtimeData[netId].maxSynWt[offset + j] >= 0) {
					if (runtimeData[netId].wt[offset + j] >= runtimeData[netId].maxSynWt[offset + j])
						runtimeData[netId].wt[offset + j] = runtimeData[netId].maxSynWt[offset + j];
					if (runtimeData[netId].wt[offset + j] < 0)
						runtimeData[netId].wt[offset + j] = 0.0;
				}
				else {
					if (runtimeData[netId].wt[offset + j] <= runtimeData[netId].maxSynWt[offset + j])
						runtimeData[netId].wt[offset + j] = runtimeData[netId].maxSynWt[offset + j];
					if (runtimeData[netId].wt[offset + j] > 0)
						runtimeData[netId].wt[offset + j] = 0.0;
				}
			}
		}
	}
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperUpdateWeights_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> updateWeights_CPU(args->netId);
		pthread_exit(0);
	}
#endif

/*!
 * \brief This function is called every second by SNN::runNetwork(). It updates the firingTableD1(D2) and
 * timeTableD1(D2) by removing older firing information.
 */
 #if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::shiftSpikeTables_CPU(int netId) {
#else // POSIX
	void* SNN::shiftSpikeTables_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);
	// Read the neuron ids that fired in the last glbNetworkConfig.maxDelay seconds
	// and put it to the beginning of the firing table...
	// (spikes between timeTableD2[999] and timeTableD2[1000+maxDelay] are still
	// within the axonal-delay window and must be carried over into the next second)
	for(int p = runtimeData[netId].timeTableD2[999], k = 0; p < runtimeData[netId].timeTableD2[999 + networkConfigs[netId].maxDelay + 1]; p++, k++) {
		runtimeData[netId].firingTableD2[k] = runtimeData[netId].firingTableD2[p];
	}

	// rebase the time tables so indices 1..maxDelay describe the carried-over spikes
	// relative to the start of the new second
	for(int i = 0; i < networkConfigs[netId].maxDelay; i++) {
		runtimeData[netId].timeTableD2[i + 1] = runtimeData[netId].timeTableD2[1000 + i + 1] - runtimeData[netId].timeTableD2[1000];
		runtimeData[netId].timeTableD1[i + 1] = runtimeData[netId].timeTableD1[1000 + i + 1] - runtimeData[netId].timeTableD1[1000];
	}

	// NOTE(review): only timeTableD1 is zeroed at index maxDelay while
	// timeTableD2[maxDelay] is read below — confirm this asymmetry against the
	// GPU backend's shiftSpikeTables kernel.
	runtimeData[netId].timeTableD1[networkConfigs[netId].maxDelay] = 0;
	// roll the per-second spike counters into the running totals, then reset them
	runtimeData[netId].spikeCountD2 += runtimeData[netId].spikeCountD2Sec;
	runtimeData[netId].spikeCountD1 += runtimeData[netId].spikeCountD1Sec;

	runtimeData[netId].spikeCountD2Sec = 0;
	runtimeData[netId].spikeCountD1Sec = 0;

	runtimeData[netId].spikeCountExtRxD2Sec = 0;
	runtimeData[netId].spikeCountExtRxD1Sec = 0;

	// number of D2 spikes carried over into the new second (still in the delay window)
	runtimeData[netId].spikeCountLastSecLeftD2 = runtimeData[netId].timeTableD2[networkConfigs[netId].maxDelay];
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperShiftSpikeTables_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> shiftSpikeTables_CPU(args->netId);
		pthread_exit(0);
	}
#endif

// Allocates and initializes the complete CPU runtime for local network netId by
// copying connection, synapse, neuron, group, and auxiliary state from the
// manager runtime, then logs per-group settings at debug level.
void SNN::allocateSNN_CPU(int netId) {
	// setup memory type of CPU runtime data
	runtimeData[netId].memType = CPU_MEM;

	// display some memory management info
	//size_t avail, total, previous;
	//float toMB = std::pow(1024.0f, 2);
	//KERNEL_INFO("CPU Memory Management: (Total %2.3f MB)",(float)(total/toMB));
	//KERNEL_INFO("Data\t\t\tSize\t\tTotal Used\tTotal Available");
	//KERNEL_INFO("Init:\t\t\t%2.3f MB\t%2.3f MB\t%2.3f MB",(float)(total)/toMB,(float)((total-avail)/toMB), (float)(avail/toMB));
	//previous=avail;

	// allocate SNN::runtimeData[0].randNum for random number generators
	// NOTE(review): raw new[] — presumably released in the matching CPU
	// deallocation routine; confirm ownership.
	runtimeData[netId].randNum = new float[networkConfigs[netId].numNPois];
	//KERNEL_INFO("Random Gen:\t\t%2.3f MB\t%2.3f MB\t%2.3f MB",(float)(previous-avail)/toMB, (float)((total-avail)/toMB),(float)(avail/toMB));
	//previous=avail;


	// initialize (copy from SNN) runtimeData[0].Npre, runtimeData[0].Npre_plastic, runtimeData[0].Npre_plasticInv, runtimeData[0].cumulativePre
	// initialize (copy from SNN) runtimeData[0].cumulativePost, runtimeData[0].Npost, runtimeData[0].postDelayInfo
	// initialize (copy from SNN) runtimeData[0].postSynapticIds, runtimeData[0].preSynapticIds
	copyPreConnectionInfo(netId, ALL, &runtimeData[netId], &managerRuntimeData, true);
	copyPostConnectionInfo(netId, ALL, &runtimeData[netId], &managerRuntimeData, true);
	//KERNEL_INFO("Conn Info:\t\t%2.3f MB\t%2.3f MB\t%2.3f MB",(float)(previous-avail)/toMB,(float)((total-avail)/toMB), (float)(avail/toMB));
	//previous=avail;

	// initialize (copy from SNN) runtimeData[0].wt, runtimeData[0].wtChange, runtimeData[0].maxSynWt
	copySynapseState(netId, &runtimeData[netId], &managerRuntimeData, true);
	//KERNEL_INFO("Syn State:\t\t%2.3f MB\t%2.3f MB\t%2.3f MB",(float)(previous-avail)/toMB,(float)((total-avail)/toMB), (float)(avail/toMB));
	//previous=avail;

	// copy the neuron state information to the CPU runtime
	// initialize (copy from managerRuntimeData) runtimeData[0].recovery, runtimeData[0].voltage, runtimeData[0].current
	// initialize (copy from managerRuntimeData) runtimeData[0].gGABAa, runtimeData[0].gGABAb, runtimeData[0].gAMPA, runtimeData[0].gNMDA
	// initialize (copy from SNN) runtimeData[0].Izh_a, runtimeData[0].Izh_b, runtimeData[0].Izh_c, runtimeData[0].Izh_d
	// initialize (copy form SNN) runtimeData[0].baseFiring, runtimeData[0].baseFiringInv
	// initialize (copy from SNN) runtimeData[0].n(V,U,I)Buffer[]
	copyNeuronState(netId, ALL, &runtimeData[netId], true);

	// copy STP state, considered as neuron state
	if (sim_with_stp) {
		// initialize (copy from SNN) stpu, stpx
		copySTPState(netId, ALL, &runtimeData[netId], &managerRuntimeData, true);
	}
	//KERNEL_INFO("Neuron State:\t\t%2.3f MB\t%2.3f MB\t%2.3f MB",(float)(previous-avail)/toMB,(float)((total-avail)/toMB), (float)(avail/toMB));
	//previous=avail;

	// initialize (copy from SNN) runtimeData[0].grpDA(5HT,ACh,NE)
	// initialize (copy from SNN) runtimeData[0].grpDA(5HT,ACh,NE)Buffer[]
	copyGroupState(netId, ALL, &runtimeData[netId], &managerRuntimeData, true);
	//KERNEL_INFO("Group State:\t\t%2.3f MB\t%2.3f MB\t%2.3f MB",(float)(previous-avail)/toMB,(float)((total-avail)/toMB), (float)(avail/toMB));
	//previous=avail;

	// initialize (cudaMemset) runtimeData[0].I_set, runtimeData[0].poissonFireRate
	// initialize (copy from SNN) runtimeData[0].firingTableD1, runtimeData[0].firingTableD2
	// initialize (cudaMalloc) runtimeData[0].spikeGenBits
	// initialize (copy from managerRuntimeData) runtimeData[0].nSpikeCnt,
	// initialize (copy from SNN) runtimeData[0].synSpikeTime, runtimeData[0].lastSpikeTime
	copyAuxiliaryData(netId, ALL, &runtimeData[netId], true);
	//KERNEL_INFO("Auxiliary Data:\t\t%2.3f MB\t%2.3f MB\t%2.3f MB\n\n",(float)(previous-avail)/toMB,(float)((total-avail)/toMB), (float)(avail/toMB));
	//previous=avail;

	// TODO: move mulSynFast, mulSynSlow to ConnectConfig structure
	// copy connection configs
	//CUDA_CHECK_ERRORS(cudaMemcpyToSymbol(d_mulSynFast, mulSynFast, sizeof(float) * networkConfigs[netId].numConnections, 0, cudaMemcpyHostToDevice));
	//CUDA_CHECK_ERRORS(cudaMemcpyToSymbol(d_mulSynSlow, mulSynSlow, sizeof(float) * networkConfigs[netId].numConnections, 0, cudaMemcpyHostToDevice));

	// dump the effective per-group configuration for debugging
	KERNEL_DEBUG("Transfering group settings to CPU:");
	for (int lGrpId = 0; lGrpId < networkConfigs[netId].numGroupsAssigned; lGrpId++) {
		KERNEL_DEBUG("Settings for Group %s:", groupConfigMap[groupConfigs[netId][lGrpId].gGrpId].grpName.c_str());

		KERNEL_DEBUG("\tType: %d",(int)groupConfigs[netId][lGrpId].Type);
		KERNEL_DEBUG("\tNumN: %d",groupConfigs[netId][lGrpId].numN);
		KERNEL_DEBUG("\tM: %d",groupConfigs[netId][lGrpId].numPostSynapses);
		KERNEL_DEBUG("\tPreM: %d",groupConfigs[netId][lGrpId].numPreSynapses);
		KERNEL_DEBUG("\tspikeGenerator: %d",(int)groupConfigs[netId][lGrpId].isSpikeGenerator);
		KERNEL_DEBUG("\tFixedInputWts: %d",(int)groupConfigs[netId][lGrpId].FixedInputWts);
		KERNEL_DEBUG("\tMaxDelay: %d",(int)groupConfigs[netId][lGrpId].MaxDelay);
		KERNEL_DEBUG("\tWithSTDP: %d",(int)groupConfigs[netId][lGrpId].WithSTDP);
		if (groupConfigs[netId][lGrpId].WithSTDP) {
			KERNEL_DEBUG("\t\tE-STDP type: %s",stdpType_string[groupConfigs[netId][lGrpId].WithESTDPtype]);
			KERNEL_DEBUG("\t\tTAU_PLUS_INV_EXC: %f",groupConfigs[netId][lGrpId].TAU_PLUS_INV_EXC);
			KERNEL_DEBUG("\t\tTAU_MINUS_INV_EXC: %f",groupConfigs[netId][lGrpId].TAU_MINUS_INV_EXC);
			KERNEL_DEBUG("\t\tALPHA_PLUS_EXC: %f",groupConfigs[netId][lGrpId].ALPHA_PLUS_EXC);
			KERNEL_DEBUG("\t\tALPHA_MINUS_EXC: %f",groupConfigs[netId][lGrpId].ALPHA_MINUS_EXC);
			KERNEL_DEBUG("\t\tI-STDP type: %s",stdpType_string[groupConfigs[netId][lGrpId].WithISTDPtype]);
			KERNEL_DEBUG("\t\tTAU_PLUS_INV_INB: %f",groupConfigs[netId][lGrpId].TAU_PLUS_INV_INB);
			KERNEL_DEBUG("\t\tTAU_MINUS_INV_INB: %f",groupConfigs[netId][lGrpId].TAU_MINUS_INV_INB);
			KERNEL_DEBUG("\t\tALPHA_PLUS_INB: %f",groupConfigs[netId][lGrpId].ALPHA_PLUS_INB);
			KERNEL_DEBUG("\t\tALPHA_MINUS_INB: %f",groupConfigs[netId][lGrpId].ALPHA_MINUS_INB);
			KERNEL_DEBUG("\t\tLAMBDA: %f",groupConfigs[netId][lGrpId].LAMBDA);
			KERNEL_DEBUG("\t\tDELTA: %f",groupConfigs[netId][lGrpId].DELTA);
			KERNEL_DEBUG("\t\tBETA_LTP: %f",groupConfigs[netId][lGrpId].BETA_LTP);
			KERNEL_DEBUG("\t\tBETA_LTD: %f",groupConfigs[netId][lGrpId].BETA_LTD);
		}
		KERNEL_DEBUG("\tWithSTP: %d",(int)groupConfigs[netId][lGrpId].WithSTP);
		if (groupConfigs[netId][lGrpId].WithSTP) {
			KERNEL_DEBUG("\t\tSTP_U: %f", groupConfigs[netId][lGrpId].STP_U);
//				KERNEL_DEBUG("\t\tSTP_tD: %f",groupConfigs[netId][lGrpId].STP_tD);
//				KERNEL_DEBUG("\t\tSTP_tF: %f",groupConfigs[netId][lGrpId].STP_tF);
		}
		KERNEL_DEBUG("\tspikeGen: %s", groupConfigs[netId][lGrpId].isSpikeGenFunc? "is Set" : "is not set ");
	}

	// allocation of CPU runtime data is done
	runtimeData[netId].allocated = true;
}

/*!
 * \brief this function allocates memory space and copies information of pre-connections to it
 *
 * This function:
 * initialize Npre_plasticInv
 * (allocate and) copy Npre, Npre_plastic, Npre_plasticInv, cumulativePre, preSynapticIds
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 *
 * \sa allocateSNN_CPU
 * \since v4.0
*/
void SNN::copyPreConnectionInfo(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem) {
	int lengthN, lengthSyn, posN, posSyn;

	// neuron span to copy: the whole assigned range, or one group's range
	if (lGrpId == ALL) {
		lengthN = networkConfigs[netId].numNAssigned;
		posN = 0;
	} else {
		lengthN = groupConfigs[netId][lGrpId].numN;
		posN = groupConfigs[netId][lGrpId].lStartN;
	}

	// per-neuron pre-synapse counts
	if(allocateMem)
		dest->Npre = new unsigned short[networkConfigs[netId].numNAssigned];
	memcpy(&dest->Npre[posN], &src->Npre[posN], sizeof(short) * lengthN);

	// we don't need these data structures if the network doesn't have any plastic synapses at all
	if (!sim_with_fixedwts) {
		// per-neuron count of plastic pre-synapses
		if(allocateMem)
			dest->Npre_plastic = new unsigned short[networkConfigs[netId].numNAssigned];
		memcpy(&dest->Npre_plastic[posN], &src->Npre_plastic[posN], sizeof(short) * lengthN);

		// Npre_plasticInv is only used on GPUs; only allocate and fill it during initialization.
		// Computed directly into the destination buffer — the original code built a temporary
		// array, allocated the destination, memcpy'd, and deleted the temporary, performing a
		// redundant allocation and copy for the same result.
		if(allocateMem) {
			dest->Npre_plasticInv = new float[networkConfigs[netId].numNAssigned];

			// NOTE(review): 1.0f/0 yields +inf when Npre_plastic[i] == 0 — presumably such
			// entries are never read for neurons without plastic pre-synapses; confirm on GPU path.
			for (int i = 0; i < networkConfigs[netId].numNAssigned; i++)
				dest->Npre_plasticInv[i] = 1.0f / managerRuntimeData.Npre_plastic[i];
		}
	}

	// beginning position for the pre-synaptic information of each neuron
	if(allocateMem)
		dest->cumulativePre = new unsigned int[networkConfigs[netId].numNAssigned];
	memcpy(&dest->cumulativePre[posN], &src->cumulativePre[posN], sizeof(int) * lengthN);

	// Npre and cumulativePre are now valid in dest; derive the synapse span from them
	if (lGrpId == ALL) {
		lengthSyn = networkConfigs[netId].numPreSynNet;
		posSyn = 0;
	} else {
		lengthSyn = 0;
		for (int lNId = groupConfigs[netId][lGrpId].lStartN; lNId <= groupConfigs[netId][lGrpId].lEndN; lNId++)
			lengthSyn += dest->Npre[lNId];

		posSyn = dest->cumulativePre[groupConfigs[netId][lGrpId].lStartN];
	}

	// the pre-synaptic connection entries themselves
	if(allocateMem)
		dest->preSynapticIds = new SynInfo[networkConfigs[netId].numPreSynNet];
	memcpy(&dest->preSynapticIds[posSyn], &src->preSynapticIds[posSyn], sizeof(SynInfo) * lengthSyn);
}

/*!
 * \brief allocates memory space (optionally) and copies post-connection information to it
 *
 * This function:
 * (allocate and) copy Npost, cumulativePost, postSynapticIds, postDelayInfo
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 *
 * \sa allocateSNN_CPU
 * \since v4.0
 */
void SNN::copyPostConnectionInfo(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem) {
	int numNeur, numSyn, neurPos, synPos;

	// neuron span to copy: the whole assigned range, or one group's range
	if (lGrpId == ALL) {
		numNeur = networkConfigs[netId].numNAssigned;
		neurPos = 0;
	} else {
		numNeur = groupConfigs[netId][lGrpId].numN;
		neurPos = groupConfigs[netId][lGrpId].lStartN;
	}

	// per-neuron post-synapse counts
	if (allocateMem)
		dest->Npost = new unsigned short[networkConfigs[netId].numNAssigned];
	memcpy(&dest->Npost[neurPos], &src->Npost[neurPos], sizeof(short) * numNeur);

	// starting offset of each neuron's post-synaptic entries
	if (allocateMem)
		dest->cumulativePost = new unsigned int[networkConfigs[netId].numNAssigned];
	memcpy(&dest->cumulativePost[neurPos], &src->cumulativePost[neurPos], sizeof(int) * numNeur);

	// Npost and cumulativePost are now valid in dest; derive the synapse span from them
	if (lGrpId == ALL) {
		numSyn = networkConfigs[netId].numPostSynNet;
		synPos = 0;
	} else {
		numSyn = 0;
		for (int lNId = groupConfigs[netId][lGrpId].lStartN; lNId <= groupConfigs[netId][lGrpId].lEndN; lNId++)
			numSyn += dest->Npost[lNId];

		synPos = dest->cumulativePost[groupConfigs[netId][lGrpId].lStartN];
	}

	// the post-synaptic connection entries themselves
	if (allocateMem)
		dest->postSynapticIds = new SynInfo[networkConfigs[netId].numPostSynNet];
	memcpy(&dest->postSynapticIds[synPos], &src->postSynapticIds[synPos], sizeof(SynInfo) * numSyn);

	// per-neuron, per-delay-slot bookkeeping for post-synaptic delivery
	const int delaySlots = glbNetworkConfig.maxDelay + 1;
	if (allocateMem)
		dest->postDelayInfo = new DelayInfo[networkConfigs[netId].numNAssigned * delaySlots];
	memcpy(&dest->postDelayInfo[neurPos * delaySlots], &src->postDelayInfo[neurPos * delaySlots], sizeof(DelayInfo) * numNeur * delaySlots);
}

/*!
 * \brief allocates memory space (optionally) and copies synaptic state variables to it
 *
 * This function:
 * (allocate and) copy wt, wtChange, maxSynWt
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 *
 * \sa allocateSNN_CPU
 * \since v4.0
 */
void SNN::copySynapseState(int netId, RuntimeData* dest, RuntimeData* src, bool allocateMem) {
	const int numSyn = networkConfigs[netId].numPreSynNet;
	assert(numSyn > 0);

	// synaptic weights
	if (allocateMem)
		dest->wt = new float[numSyn];
	memcpy(dest->wt, src->wt, sizeof(float) * numSyn);

	// wtChange and maxSynWt are only read by updateLTP() and updateSynapticWeights(),
	// which never run when the network has fixed weights only — skip them entirely then
	if (!sim_with_fixedwts) {
		// synaptic weight derivative
		if (allocateMem)
			dest->wtChange = new float[numSyn];
		memcpy(dest->wtChange, src->wtChange, sizeof(float) * numSyn);

		// per-synapse upper bound on the weight
		if (allocateMem)
			dest->maxSynWt = new float[numSyn];
		memcpy(dest->maxSynWt, src->maxSynWt, sizeof(float) * numSyn);
	}
}

/*!
 * \brief this function allocates memory space and copies variables related to neuron state to it
 *
 * This function:
 * (allocate and) copy recovery, voltage, nextVoltage, current, curSpike, avgFiring,
 * and delegates conductances, external current, neuron parameters, and the neuron
 * state buffer to the respective copy functions.
 *
 * This function is called by allocateSNN_CPU(). Only copying from host to device is required
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 *
 * \sa allocateSNN_CPU fetchNeuronState
 * \since v3.0
 */
void SNN::copyNeuronState(int netId, int lGrpId, RuntimeData* dest, bool allocateMem) {
	int ptrPos, length;

	// The buffers below are sized to `length` but written at offset ptrPos, so
	// allocation is only valid for a whole-network copy (ptrPos == 0). This is the
	// same invariant copyNeuronParameters() asserts; enforce it here too instead of
	// silently overflowing the allocations in a group-wise allocate call.
	assert(!allocateMem || lGrpId == ALL);

	if(lGrpId == ALL) {
		ptrPos  = 0;
		length  = networkConfigs[netId].numNReg;
	}
	else {
		ptrPos  = groupConfigs[netId][lGrpId].lStartN;
		length  = groupConfigs[netId][lGrpId].numN;
	}

	assert(length <= networkConfigs[netId].numNReg);

	if (length == 0)
		return;

	// Poisson groups carry no integrable neuron state; skip group-wise copies for them.
	// FIX: only index groupConfigs with a real group id — the original evaluated
	// groupConfigs[netId][lGrpId] even when lGrpId == ALL, which is not a valid entry
	// (every other path in this file special-cases ALL before indexing).
	if(!allocateMem && lGrpId != ALL && (groupConfigs[netId][lGrpId].Type & POISSON_NEURON))
		return;

	// membrane recovery variable
	if(allocateMem)
		dest->recovery = new float[length];
	memcpy(&dest->recovery[ptrPos], &managerRuntimeData.recovery[ptrPos], sizeof(float) * length);

	// membrane potential
	if(allocateMem)
		dest->voltage = new float[length];
	memcpy(&dest->voltage[ptrPos], &managerRuntimeData.voltage[ptrPos], sizeof(float) * length);

	// membrane potential for the next integration step
	if (allocateMem)
		dest->nextVoltage = new float[length];
	memcpy(&dest->nextVoltage[ptrPos], &managerRuntimeData.nextVoltage[ptrPos], sizeof(float) * length);

	// neuron input current
	if(allocateMem)
		dest->current = new float[length];
	memcpy(&dest->current[ptrPos], &managerRuntimeData.current[ptrPos], sizeof(float) * length);

	if (sim_with_conductances) {
		// conductance information (COBA mode only)
		copyConductanceAMPA(netId, lGrpId, dest, &managerRuntimeData, allocateMem, 0);
		copyConductanceNMDA(netId, lGrpId, dest, &managerRuntimeData, allocateMem, 0);
		copyConductanceGABAa(netId, lGrpId, dest, &managerRuntimeData, allocateMem, 0);
		copyConductanceGABAb(netId, lGrpId, dest, &managerRuntimeData, allocateMem, 0);
	}

	// copying external current needs to be done separately because setExternalCurrent needs to call it, too
	// do it only from host to device
	copyExternalCurrent(netId, lGrpId, dest, allocateMem);

	// per-neuron "spiked this step" flag
	if (allocateMem)
		dest->curSpike = new bool[length];
	memcpy(&dest->curSpike[ptrPos], &managerRuntimeData.curSpike[ptrPos], sizeof(bool) * length);

	copyNeuronParameters(netId, lGrpId, dest, allocateMem);

	if (networkConfigs[netId].sim_with_nm)
		copyNeuronStateBuffer(netId, lGrpId, dest, &managerRuntimeData, allocateMem);

	if (sim_with_homeostasis) {
		// average firing rate, needed for homeostasis in CPU_MODE
		if(allocateMem)
			dest->avgFiring = new float[length];
		memcpy(&dest->avgFiring[ptrPos], &managerRuntimeData.avgFiring[ptrPos], sizeof(float) * length);
	}
}

/*!
 * \brief this function allocates memory space and copies AMPA conductance to it
 *
 * This function:
 * (allocate and) copy gAMPA
 *
 * This function is called by copyNeuronState() and fetchConductanceAMPA(). It supports bi-directional copying
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 * \param[in] destOffset the offset of the data destination, used in local-to-global copies
 *
 * \sa copyNeuronState fetchConductanceAMPA
 * \since v3.0
 */
void SNN::copyConductanceAMPA(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem, int destOffset) {
	assert(isSimulationWithCOBA());

	// FIX: the buffer is sized to `length` below but written at ptrPos + destOffset;
	// allocation is therefore only valid for a whole-network copy with no offset
	// (the same invariant copyNeuronParameters() enforces). Assert it instead of
	// silently overflowing the fresh allocation.
	assert(!allocateMem || (lGrpId == ALL && destOffset == 0));

	int ptrPos, length;

	if(lGrpId == ALL) {
		ptrPos = 0;
		length = networkConfigs[netId].numNReg;
	} else {
		ptrPos = groupConfigs[netId][lGrpId].lStartN;
		length = groupConfigs[netId][lGrpId].numN;
	}
	assert(length <= networkConfigs[netId].numNReg);
	assert(length > 0);

	// AMPA conductance
	assert(src->gAMPA  != NULL);
	if(allocateMem)
		dest->gAMPA = new float[length];
	memcpy(&dest->gAMPA[ptrPos + destOffset], &src->gAMPA[ptrPos], sizeof(float) * length);
}

/*!
 * \brief this function allocates memory space and copies NMDA conductance to it
 *
 * This function:
 * (allocate and) copy gNMDA, or gNMDA_r and gNMDA_d when the NMDA rise time is modeled
 *
 * This function is called by copyNeuronState() and fetchConductanceNMDA(). It supports bi-directional copying
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 * \param[in] destOffset the offset of the data destination, used in local-to-global copies
 *
 * \sa copyNeuronState fetchConductanceNMDA
 * \since v3.0
*/
void SNN::copyConductanceNMDA(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem, int destOffset) {
	assert(isSimulationWithCOBA());

	// FIX: the buffers are sized to `length` below but written at an offset from ptrPos;
	// allocation is therefore only valid for a whole-network copy with no offset
	// (the same invariant copyNeuronParameters() enforces). Assert it instead of
	// silently overflowing the fresh allocations.
	assert(!allocateMem || (lGrpId == ALL && destOffset == 0));

	int ptrPos, length;

	if(lGrpId == ALL) {
		ptrPos  = 0;
		length  = networkConfigs[netId].numNReg;
	} else {
		ptrPos  = groupConfigs[netId][lGrpId].lStartN;
		length  = groupConfigs[netId][lGrpId].numN;
	}
	assert(length  <= networkConfigs[netId].numNReg);
	assert(length > 0);

	if (isSimulationWithNMDARise()) {
		// rise-time mode keeps separate rising and decaying components
		assert(src->gNMDA_r != NULL);
		if(allocateMem)
			dest->gNMDA_r = new float[length];
		memcpy(&dest->gNMDA_r[ptrPos], &src->gNMDA_r[ptrPos], sizeof(float) * length);

		assert(src->gNMDA_d != NULL);
		if(allocateMem)
			dest->gNMDA_d = new float[length];
		memcpy(&dest->gNMDA_d[ptrPos], &src->gNMDA_d[ptrPos], sizeof(float) * length);
	} else {
		// single-exponential NMDA conductance
		assert(src->gNMDA != NULL);
		if(allocateMem)
			dest->gNMDA = new float[length];
		memcpy(&dest->gNMDA[ptrPos + destOffset], &src->gNMDA[ptrPos], sizeof(float) * length);
	}
}

/*!
 * \brief this function allocates memory space and copies GABAa conductance to it
 *
 * This function:
 * (allocate and) copy gGABAa
 *
 * This function is called by copyNeuronState() and fetchConductanceGABAa(). It supports bi-directional copying
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 * \param[in] destOffset the offset of the data destination, used in local-to-global copies
 *
 * \sa copyNeuronState fetchConductanceGABAa
 * \since v3.0
 */
void SNN::copyConductanceGABAa(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem, int destOffset) {
	assert(isSimulationWithCOBA());

	// FIX: the buffer is sized to `length` below but written at ptrPos + destOffset;
	// allocation is therefore only valid for a whole-network copy with no offset
	// (the same invariant copyNeuronParameters() enforces). Assert it instead of
	// silently overflowing the fresh allocation.
	assert(!allocateMem || (lGrpId == ALL && destOffset == 0));

	int ptrPos, length;

	if(lGrpId == ALL) {
		ptrPos  = 0;
		length  = networkConfigs[netId].numNReg;
	} else {
		ptrPos  = groupConfigs[netId][lGrpId].lStartN;
		length  = groupConfigs[netId][lGrpId].numN;
	}
	assert(length  <= networkConfigs[netId].numNReg);
	assert(length > 0);

	// GABAa conductance
	assert(src->gGABAa != NULL);
	if(allocateMem)
		dest->gGABAa = new float[length];
	memcpy(&dest->gGABAa[ptrPos + destOffset], &src->gGABAa[ptrPos], sizeof(float) * length);
}

/*!
 * \brief this function allocates memory space and copies GABAb conductance to it
 *
 * This function:
 * (allocate and) copy gGABAb, or gGABAb_r and gGABAb_d when the GABAb rise time is modeled
 *
 * This function is called by copyNeuronState() and fetchConductanceGABAb(). It supports bi-directional copying
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 * \param[in] destOffset the offset of the data destination, used in local-to-global copies
 *
 * \sa copyNeuronState fetchConductanceGABAb
 * \since v3.0
 */
void SNN::copyConductanceGABAb(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem, int destOffset) {
	assert(isSimulationWithCOBA());

	// FIX: the buffers are sized to `length` below but written at an offset from ptrPos;
	// allocation is therefore only valid for a whole-network copy with no offset
	// (the same invariant copyNeuronParameters() enforces). Assert it instead of
	// silently overflowing the fresh allocations.
	assert(!allocateMem || (lGrpId == ALL && destOffset == 0));

	int ptrPos, length;

	if (lGrpId == ALL) {
		ptrPos  = 0;
		length  = networkConfigs[netId].numNReg;
	} else {
		ptrPos  = groupConfigs[netId][lGrpId].lStartN;
		length  = groupConfigs[netId][lGrpId].numN;
	}
	assert(length <= networkConfigs[netId].numNReg);
	assert(length > 0);

	if (isSimulationWithGABAbRise()) {
		// rise-time mode keeps separate rising and decaying components
		assert(src->gGABAb_r != NULL);
		if(allocateMem)
			dest->gGABAb_r = new float[length];
		memcpy(&dest->gGABAb_r[ptrPos], &src->gGABAb_r[ptrPos], sizeof(float) * length);

		assert(src->gGABAb_d != NULL);
		if(allocateMem)
			dest->gGABAb_d = new float[length];
		memcpy(&dest->gGABAb_d[ptrPos], &src->gGABAb_d[ptrPos], sizeof(float) * length);
	} else {
		// single-exponential GABAb conductance
		assert(src->gGABAb != NULL);
		if(allocateMem)
			dest->gGABAb = new float[length];
		memcpy(&dest->gGABAb[ptrPos + destOffset], &src->gGABAb[ptrPos], sizeof(float) * length);
	}
}

/*!
* \brief copies the neuron-monitor state buffers (V, U, I) of a local network
*
* This function:
* (allocate and) copy nVBuffer, nUBuffer, nIBuffer — either the whole buffer
* (lGrpId == ALL) or only one group's slice for each of the 1000 time bins
*
* This function is called by copyNeuronState()
*
* \param[in] netId the id of a local network, which is the same as the Core (CPU) id
* \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
* \param[in] dest pointer to runtime data destination
* \param[in] src pointer to runtime data source
* \param[in] allocateMem a flag indicating whether to allocate memory space before copying
*
* \sa copyNeuronState
* \since v4.0
*/
void SNN::copyNeuronStateBuffer(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem) {
	// total buffer size: 1000 time bins x numGroups slices of MAX_NEURON_MON_GRP_SZIE entries
	const int bufferSize = networkConfigs[netId].numGroups * MAX_NEURON_MON_GRP_SZIE * 1000;

	assert(src->nVBuffer != NULL);
	if (allocateMem)
		dest->nVBuffer = new float[bufferSize];

	assert(src->nUBuffer != NULL);
	if (allocateMem)
		dest->nUBuffer = new float[bufferSize];

	assert(src->nIBuffer != NULL);
	if (allocateMem)
		dest->nIBuffer = new float[bufferSize];

	if (lGrpId == ALL) {
		// one contiguous copy covering every group and every time bin
		memcpy(dest->nVBuffer, src->nVBuffer, sizeof(float) * bufferSize);
		memcpy(dest->nUBuffer, src->nUBuffer, sizeof(float) * bufferSize);
		memcpy(dest->nIBuffer, src->nIBuffer, sizeof(float) * bufferSize);
	} else {
		// the buffers are laid out time-major: for each ms, one slice per group;
		// copy only this group's slice in each of the 1000 time bins
		for (int t = 0; t < 1000; t++) {
			const int slicePos = networkConfigs[netId].numGroups * MAX_NEURON_MON_GRP_SZIE * t + lGrpId * MAX_NEURON_MON_GRP_SZIE;

			assert((slicePos + MAX_NEURON_MON_GRP_SZIE) <= bufferSize);
			assert(MAX_NEURON_MON_GRP_SZIE > 0);

			memcpy(&dest->nVBuffer[slicePos], &src->nVBuffer[slicePos], sizeof(float) * MAX_NEURON_MON_GRP_SZIE);
			memcpy(&dest->nUBuffer[slicePos], &src->nUBuffer[slicePos], sizeof(float) * MAX_NEURON_MON_GRP_SZIE);
			memcpy(&dest->nIBuffer[slicePos], &src->nIBuffer[slicePos], sizeof(float) * MAX_NEURON_MON_GRP_SZIE);
		}
	}
}


/*!
 * \brief this function allocates memory space and copies external current to it
 *
 * This function:
 * (allocate and) copy extCurrent
 *
 * This function is called by copyNeuronState() and setExternalCurrent(). Only host-to-device copy is required
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 *
 * \sa allocateSNN_CPU setExternalCurrent
 * \since v3.0
*/
void SNN::copyExternalCurrent(int netId, int lGrpId, RuntimeData* dest, bool allocateMem) {
	int posN, lengthN;

	if(lGrpId == ALL) {
		posN  = 0;
		lengthN  = networkConfigs[netId].numNReg;
	} else {
		assert(lGrpId >= 0);
		posN = groupConfigs[netId][lGrpId].lStartN;
		lengthN = groupConfigs[netId][lGrpId].numN;
	}
	assert(lengthN >= 0 && lengthN <= networkConfigs[netId].numNReg); // regular (non-Poisson) neurons only

	KERNEL_DEBUG("copyExternalCurrent: lGrpId=%d, ptrPos=%d, length=%d, allocate=%s", lGrpId, posN, lengthN, allocateMem?"y":"n");

	// FIX: allocate the full regular-neuron range. The original allocated only
	// lengthN entries but then wrote at offset posN, overflowing the buffer
	// whenever allocateMem is combined with a specific group (posN > 0). For
	// lGrpId == ALL, lengthN == numNReg, so behavior there is unchanged.
	if(allocateMem)
		dest->extCurrent = new float[networkConfigs[netId].numNReg];
	memcpy(&(dest->extCurrent[posN]), &(managerRuntimeData.extCurrent[posN]), sizeof(float) * lengthN);
}

/*!
 * \brief this function allocates memory space and copies neural parameters to it
 *
 * This function:
 * (allocate and) copy Izh_a, Izh_b, Izh_c, Izh_d, Izh_C, Izh_k, Izh_vr, Izh_vt, Izh_vpeak
 * (allocate and) copy the LIF parameters lif_tau_m, lif_tau_ref, lif_tau_ref_c, lif_vTh, lif_vReset, lif_gain, lif_bias
 * initialize baseFiringInv
 * (allocate and) copy baseFiring, baseFiringInv
 *
 * This function is only called by copyNeuronState(). Only copying direction from host to device is required.
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied (ALL for the whole network)
 * \param[in] dest pointer to runtime data destination
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 *
 * \sa copyNeuronState
 * \since v3.0
 */
void SNN::copyNeuronParameters(int netId, int lGrpId, RuntimeData* dest, bool allocateMem) {
	int ptrPos, length;

	// when allocating we must allocate completely (whole network at once) to avoid memory fragmentation
	if (allocateMem) {
		assert(lGrpId == ALL);
		assert(dest->Izh_a == NULL);
		assert(dest->Izh_b == NULL);
		assert(dest->Izh_c == NULL);
		assert(dest->Izh_d == NULL);
		assert(dest->Izh_C == NULL);
		assert(dest->Izh_k == NULL);
		assert(dest->Izh_vr == NULL);
		assert(dest->Izh_vt == NULL);
		assert(dest->Izh_vpeak == NULL);
		assert(dest->lif_tau_m == NULL);
		assert(dest->lif_tau_ref == NULL);
		assert(dest->lif_tau_ref_c == NULL);
		assert(dest->lif_vTh == NULL);
		assert(dest->lif_vReset == NULL);
		assert(dest->lif_gain == NULL);
		assert(dest->lif_bias == NULL);
	}

	if(lGrpId == ALL) {
		ptrPos = 0;
		length = networkConfigs[netId].numNReg;
	}
	else {
		ptrPos = groupConfigs[netId][lGrpId].lStartN;
		length = groupConfigs[netId][lGrpId].numN;
	}

	// Izhikevich parameters (4-parameter and 9-parameter models)
	if(allocateMem)
		dest->Izh_a = new float[length];
	memcpy(&dest->Izh_a[ptrPos], &(managerRuntimeData.Izh_a[ptrPos]), sizeof(float) * length);

	if(allocateMem)
		dest->Izh_b = new float[length];
	memcpy(&dest->Izh_b[ptrPos], &(managerRuntimeData.Izh_b[ptrPos]), sizeof(float) * length);

	if(allocateMem)
		dest->Izh_c = new float[length];
	memcpy(&dest->Izh_c[ptrPos], &(managerRuntimeData.Izh_c[ptrPos]), sizeof(float) * length);

	if(allocateMem)
		dest->Izh_d = new float[length];
	memcpy(&dest->Izh_d[ptrPos], &(managerRuntimeData.Izh_d[ptrPos]), sizeof(float) * length);

	if (allocateMem)
		dest->Izh_C = new float[length];
	memcpy(&dest->Izh_C[ptrPos], &(managerRuntimeData.Izh_C[ptrPos]), sizeof(float) * length);

	if (allocateMem)
		dest->Izh_k = new float[length];
	memcpy(&dest->Izh_k[ptrPos], &(managerRuntimeData.Izh_k[ptrPos]), sizeof(float) * length);

	if (allocateMem)
		dest->Izh_vr = new float[length];
	memcpy(&dest->Izh_vr[ptrPos], &(managerRuntimeData.Izh_vr[ptrPos]), sizeof(float) * length);

	if (allocateMem)
		dest->Izh_vt = new float[length];
	memcpy(&dest->Izh_vt[ptrPos], &(managerRuntimeData.Izh_vt[ptrPos]), sizeof(float) * length);

	if (allocateMem)
		dest->Izh_vpeak = new float[length];
	memcpy(&dest->Izh_vpeak[ptrPos], &(managerRuntimeData.Izh_vpeak[ptrPos]), sizeof(float) * length);

	// LIF neuron parameters
	if(allocateMem)
		dest->lif_tau_m = new int[length];
	memcpy(&dest->lif_tau_m[ptrPos], &(managerRuntimeData.lif_tau_m[ptrPos]), sizeof(int) * length);

	if(allocateMem)
		dest->lif_tau_ref = new int[length];
	memcpy(&dest->lif_tau_ref[ptrPos], &(managerRuntimeData.lif_tau_ref[ptrPos]), sizeof(int) * length);

	if(allocateMem)
		dest->lif_tau_ref_c = new int[length];
	memcpy(&dest->lif_tau_ref_c[ptrPos], &(managerRuntimeData.lif_tau_ref_c[ptrPos]), sizeof(int) * length);

	if(allocateMem)
		dest->lif_vTh = new float[length];
	memcpy(&dest->lif_vTh[ptrPos], &(managerRuntimeData.lif_vTh[ptrPos]), sizeof(float) * length);

	if(allocateMem)
		dest->lif_vReset = new float[length];
	memcpy(&dest->lif_vReset[ptrPos], &(managerRuntimeData.lif_vReset[ptrPos]), sizeof(float) * length);

	if(allocateMem)
		dest->lif_gain = new float[length];
	memcpy(&dest->lif_gain[ptrPos], &(managerRuntimeData.lif_gain[ptrPos]), sizeof(float) * length);

	if(allocateMem)
		dest->lif_bias = new float[length];
	memcpy(&dest->lif_bias[ptrPos], &(managerRuntimeData.lif_bias[ptrPos]), sizeof(float) * length);

	// pre-compute baseFiringInv for fast computation on CPU cores
	if (sim_with_homeostasis) {
		float* baseFiringInv = new float[length];
		for(int nid = 0; nid < length; nid++) {
			// FIX: the original tested baseFiring[nid] but divided by
			// baseFiring[ptrPos + nid]; use the same (offset) element for both.
			// (Identical when lGrpId == ALL, i.e. ptrPos == 0.)
			if (managerRuntimeData.baseFiring[ptrPos + nid] != 0.0f)
				baseFiringInv[nid] = 1.0f / managerRuntimeData.baseFiring[ptrPos + nid];
			else
				baseFiringInv[nid] = 0.0f;
		}

		if(allocateMem)
			dest->baseFiringInv = new float[length];
		memcpy(&dest->baseFiringInv[ptrPos], baseFiringInv, sizeof(float) * length);

		if(allocateMem)
			dest->baseFiring = new float[length];
		// FIX: copy from the ptrPos offset — the original always copied from index 0,
		// which is wrong for group-wise (lGrpId != ALL) copies.
		memcpy(&dest->baseFiring[ptrPos], &managerRuntimeData.baseFiring[ptrPos], sizeof(float) * length);

		delete [] baseFiringInv;
	}
}

/*!
 * \brief allocates memory space (optionally) and copies short-term plasticity (STP) state to it
 *
 * This function:
 * (allocate and) copy stpu, stpx
 *
 * This function is called by allocateSNN_CPU() and fetchSTPState(). It supports bi-directional copying
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied
 * \param[in] dest pointer to runtime data destination
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicating whether to allocate memory space before copying
 *
 * \sa allocateSNN_CPU fetchSTPState
 * \since v3.0
 */
void SNN::copySTPState(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem) {
	// one entry per neuron and per delay slot
	const int numEntries = networkConfigs[netId].numN * (networkConfigs[netId].maxDelay + 1);

	// STP is an optional feature; verify the buffers are in the expected state
	if (allocateMem) {
		assert(dest->stpu == NULL);
		assert(dest->stpx == NULL);
	} else {
		assert(dest->stpu != NULL);
		assert(dest->stpx != NULL);
	}
	assert(src->stpu != NULL);
	assert(src->stpx != NULL);

	// utilization variable u
	if (allocateMem)
		dest->stpu = new float[numEntries];
	memcpy(dest->stpu, src->stpu, sizeof(float) * numEntries);

	// available-resources variable x
	if (allocateMem)
		dest->stpx = new float[numEntries];
	memcpy(dest->stpx, src->stpx, sizeof(float) * numEntries);
}

// ToDo: move grpDA(5HT, ACh, NE)Buffer to copyAuxiliaryData
/*!
 * \brief this function allocates memory sapce and copies variables related to group state to it
 *
 * This function:
 * (allocate and) copy grpDA, grp5HT, grpACh, grpNE, grpDABuffer, grp5HTBuffer, grpAChBuffer, grpNEBuffer
 *
 * This funcion is called by allocateSNN_CPU() and fetchGroupState(). It supports bi-directional copying
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifiy the group(s) to be copied
 * \param[in] dest pointer to runtime data desitnation
 * \param[in] src pointer to runtime data source
 * \param[in] allocateMem a flag indicates whether allocating memory space before copying
 *
 * \sa allocateSNN_CPU fetchGroupState
 * \since v3.0
 */
void SNN::copyGroupState(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem) {
	if (allocateMem) {
		assert(dest->memType == CPU_MEM && !dest->allocated);
		dest->grpDA = new float[networkConfigs[netId].numGroups];
		dest->grp5HT = new float[networkConfigs[netId].numGroups];
		dest->grpACh = new float[networkConfigs[netId].numGroups];
		dest->grpNE = new float[networkConfigs[netId].numGroups];
	}
	memcpy(dest->grpDA, src->grpDA, sizeof(float) * networkConfigs[netId].numGroups);
	memcpy(dest->grp5HT, src->grp5HT, sizeof(float) * networkConfigs[netId].numGroups);
	memcpy(dest->grpACh, src->grpACh, sizeof(float) * networkConfigs[netId].numGroups);
	memcpy(dest->grpNE, src->grpNE, sizeof(float) * networkConfigs[netId].numGroups);

	if (lGrpId == ALL) {
		if (allocateMem) {
			assert(dest->memType == CPU_MEM && !dest->allocated);
			dest->grpDABuffer = new float[1000 * networkConfigs[netId].numGroups];
			dest->grp5HTBuffer = new float[1000 * networkConfigs[netId].numGroups];
			dest->grpAChBuffer = new float[1000 * networkConfigs[netId].numGroups];
			dest->grpNEBuffer = new float[1000 * networkConfigs[netId].numGroups];
		}
		memcpy(dest->grpDABuffer, src->grpDABuffer, sizeof(float) * 1000 * networkConfigs[netId].numGroups);
		memcpy(dest->grp5HTBuffer, src->grp5HTBuffer, sizeof(float) * 1000 * networkConfigs[netId].numGroups);
		memcpy(dest->grpAChBuffer, src->grpAChBuffer, sizeof(float) * 1000 * networkConfigs[netId].numGroups);
		memcpy(dest->grpNEBuffer, src->grpNEBuffer, sizeof(float) * 1000 * networkConfigs[netId].numGroups);
	} else {
		assert(!allocateMem);
		memcpy(&dest->grpDABuffer[lGrpId * 1000], &src->grpDABuffer[lGrpId * 1000], sizeof(float) * 1000);
		memcpy(&dest->grp5HTBuffer[lGrpId * 1000], &src->grp5HTBuffer[lGrpId * 1000], sizeof(float) * 1000);
		memcpy(&dest->grpAChBuffer[lGrpId * 1000], &src->grpAChBuffer[lGrpId * 1000], sizeof(float) * 1000);
		memcpy(&dest->grpNEBuffer[lGrpId * 1000], &src->grpNEBuffer[lGrpId * 1000], sizeof(float) * 1000);
	}
}

/*!
 * \brief this function allocates memory sapce and copies auxiliary runtime data to it
 *
 * This function:
 * (allocate and) reset spikeGenBits, poissonFireRate
 * initialize I_setLength, I_setPitch; (allocate and) reset I_set
 * (allocate and) copy synSpikeTime, lastSpikeTime
 * (allocate and) copy nSpikeCnt
 * (allocate and) copy grpIds, connIdsPreIdx
 * (allocate and) copy timeTableD1, timeTableD2
 * (allocate and) copy firingTableD1, firingTableD2
 * This funcion is only called by allocateSNN_CPU. Therefore, only copying direction from host to device is required
 *
 * \param[in] netId the id of local network, which is the same as Core (CPU) id
 * \param[in] dest pointer to runtime data desitnation
 * \param[in] allocateMem a flag indicates whether allocating memory space before copying
 *
 * \sa allocateSNN_CPU
 * \since v4.0
 */
void SNN::copyAuxiliaryData(int netId, int lGrpId, RuntimeData* dest, bool allocateMem) {
	assert(networkConfigs[netId].numN > 0);

	// spikeGenBits: one bit per spike-generator neuron, packed into 32-bit words
	if(allocateMem)
		dest->spikeGenBits = new unsigned int[networkConfigs[netId].numNSpikeGen / 32 + 1];
	memset(dest->spikeGenBits, 0, sizeof(int) * (networkConfigs[netId].numNSpikeGen / 32 + 1));

	// allocate the poisson neuron poissonFireRate
	if(allocateMem)
		dest->poissonFireRate = new float[networkConfigs[netId].numNPois];
	memset(dest->poissonFireRate, 0, sizeof(float) * networkConfigs[netId].numNPois);

	// synaptic auxiliary data
	// I_set: a bit vector indicates which synapse got a spike
	// I_setLength is the number of 32-bit words needed to hold one bit per presynaptic connection
	if(allocateMem) {
		networkConfigs[netId].I_setLength = ceil(((networkConfigs[netId].maxNumPreSynN) / 32.0f));
		dest->I_set = new int[networkConfigs[netId].numNReg * networkConfigs[netId].I_setLength];
	}
	assert(networkConfigs[netId].maxNumPreSynN >= 0);
	memset(dest->I_set, 0, sizeof(int) * networkConfigs[netId].numNReg * networkConfigs[netId].I_setLength);

	// synSpikeTime: an array indicates the last time when a synapse got a spike
	if(allocateMem)
		dest->synSpikeTime = new int[networkConfigs[netId].numPreSynNet];
	memcpy(dest->synSpikeTime, managerRuntimeData.synSpikeTime, sizeof(int) * networkConfigs[netId].numPreSynNet);

	// neural auxiliary data
	// lastSpikeTime: an array indicates the last time of a neuron emitting a spike
	// neuron firing time
	if(allocateMem)
		dest->lastSpikeTime = new int[networkConfigs[netId].numNAssigned];
	memcpy(dest->lastSpikeTime, managerRuntimeData.lastSpikeTime, sizeof(int) * networkConfigs[netId].numNAssigned);

	// auxiliary data for recording spike count of each neuron
	copyNeuronSpikeCount(netId, lGrpId, dest, &managerRuntimeData, true, 0);

	// quick lookup array for local group ids
	if(allocateMem)
		dest->grpIds = new short int[networkConfigs[netId].numNAssigned];
	memcpy(dest->grpIds, managerRuntimeData.grpIds, sizeof(short int) * networkConfigs[netId].numNAssigned);

	// quick lookup array for conn ids
	if(allocateMem)
		dest->connIdsPreIdx = new short int[networkConfigs[netId].numPreSynNet];
	memcpy(dest->connIdsPreIdx, managerRuntimeData.connIdsPreIdx, sizeof(short int) * networkConfigs[netId].numPreSynNet);

	// reset variable related to spike count
	// Note: the GPU counterpart is not required to do this
	dest->spikeCountSec = 0;
	dest->spikeCountD1Sec = 0;
	dest->spikeCountD2Sec = 0;
	dest->spikeCountExtRxD1Sec = 0;
	dest->spikeCountExtRxD2Sec = 0;
	dest->spikeCountLastSecLeftD2 = 0;
	dest->spikeCount = 0;
	dest->spikeCountD1 = 0;
	dest->spikeCountD2 = 0;
	dest->nPoissonSpikes = 0;
	dest->spikeCountExtRxD1 = 0;
	dest->spikeCountExtRxD2 = 0;

	// time table
	// Note: the GPU counterpart is not required to do this
	if (allocateMem) {
		assert(dest->timeTableD1 == NULL);
		assert(dest->timeTableD2 == NULL);
	}

	if (allocateMem)
		dest->timeTableD1 = new unsigned int[TIMING_COUNT];
	memset(dest->timeTableD1, 0, sizeof(int) * TIMING_COUNT);

	if (allocateMem)
		dest->timeTableD2 = new unsigned int[TIMING_COUNT];
	memset(dest->timeTableD2, 0, sizeof(int) * TIMING_COUNT);

	// firing table
	if (allocateMem) {
		assert(dest->firingTableD1 == NULL);
		assert(dest->firingTableD2 == NULL);
	}

	// allocate 1ms firing table
	if (allocateMem)
		dest->firingTableD1 = new int[networkConfigs[netId].maxSpikesD1];
	if (networkConfigs[netId].maxSpikesD1 > 0)
		memcpy(dest->firingTableD1, managerRuntimeData.firingTableD1, sizeof(int) * networkConfigs[netId].maxSpikesD1);

	// allocate 2+ms firing table
	if(allocateMem)
		dest->firingTableD2 = new int[networkConfigs[netId].maxSpikesD2];
	if (networkConfigs[netId].maxSpikesD2 > 0)
		memcpy(dest->firingTableD2, managerRuntimeData.firingTableD2, sizeof(int) * networkConfigs[netId].maxSpikesD2);

	// allocate external 1ms firing table
	// Note: the loop variable is deliberately NOT named lGrpId; the original
	// name shadowed the function parameter lGrpId.
	if (allocateMem) {
		dest->extFiringTableD1 = new int*[networkConfigs[netId].numGroups];
		memset(dest->extFiringTableD1, 0 /* NULL */, sizeof(int*) * networkConfigs[netId].numGroups);
		for (int lExtGrpId = 0; lExtGrpId < networkConfigs[netId].numGroups; lExtGrpId++) {
			// only groups with external connections get a per-group table; others stay NULL
			if (groupConfigs[netId][lExtGrpId].hasExternalConnect) {
				dest->extFiringTableD1[lExtGrpId] = new int[groupConfigs[netId][lExtGrpId].numN * NEURON_MAX_FIRING_RATE];
				memset(dest->extFiringTableD1[lExtGrpId], 0, sizeof(int) * groupConfigs[netId][lExtGrpId].numN * NEURON_MAX_FIRING_RATE);
			}
		}
	}

	// allocate external 2+ms firing table
	if (allocateMem) {
		dest->extFiringTableD2 = new int*[networkConfigs[netId].numGroups];
		memset(dest->extFiringTableD2, 0 /* NULL */, sizeof(int*) * networkConfigs[netId].numGroups);
		for (int lExtGrpId = 0; lExtGrpId < networkConfigs[netId].numGroups; lExtGrpId++) {
			if (groupConfigs[netId][lExtGrpId].hasExternalConnect) {
				dest->extFiringTableD2[lExtGrpId] = new int[groupConfigs[netId][lExtGrpId].numN * NEURON_MAX_FIRING_RATE];
				memset(dest->extFiringTableD2[lExtGrpId], 0, sizeof(int) * groupConfigs[netId][lExtGrpId].numN * NEURON_MAX_FIRING_RATE);
			}
		}
	}

	// allocate external 1ms firing table index
	if (allocateMem)
		dest->extFiringTableEndIdxD1 = new int[networkConfigs[netId].numGroups];
	memset(dest->extFiringTableEndIdxD1, 0, sizeof(int) * networkConfigs[netId].numGroups);


	// allocate external 2+ms firing table index
	if (allocateMem)
		dest->extFiringTableEndIdxD2 = new int[networkConfigs[netId].numGroups];
	memset(dest->extFiringTableEndIdxD2, 0, sizeof(int) * networkConfigs[netId].numGroups);
}

/*!
 * \brief this function allocates memory space and copies the spike count of each neuron to it
 *
 * This function:
 * (allocate and) copy nSpikeCnt
 *
 * This function is called by copyAuxiliaryData() and fetchNeuronSpikeCount(). It supports bi-directional copying
 *
 * \param[in] netId the id of a local network, which is the same as the Core (CPU) id
 * \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied
 * \param[in] dest pointer to the runtime data destination
 * \param[in] src pointer to the runtime data source
 * \param[in] allocateMem a flag that indicates whether to allocate memory space before copying
 * \param[in] destOffset the offset of the data destination, which is used in local-to-global copying
 *
 * \sa copyAuxiliaryData fetchNeuronSpikeCount
 * \since v4.0
 */
void SNN::copyNeuronSpikeCount(int netId, int lGrpId, RuntimeData* dest, RuntimeData* src, bool allocateMem, int destOffset) {
	int posN, lengthN;

	// determine the neuron range [posN, posN + lengthN) to copy
	if(lGrpId == ALL) {
		posN = 0;
		lengthN = networkConfigs[netId].numN;
	} else {
		posN = groupConfigs[netId][lGrpId].lStartN;
		lengthN = groupConfigs[netId][lGrpId].numN;
	}
	assert(lengthN > 0 && lengthN <= networkConfigs[netId].numN);

	// A freshly allocated buffer only holds lengthN entries starting at index 0,
	// so the memcpy below would write out of bounds if a single group (posN > 0)
	// or a non-zero destOffset were combined with allocateMem. Enforce that
	// precondition explicitly instead of silently corrupting the heap.
	assert(!allocateMem || (lGrpId == ALL && destOffset == 0));

	// spike count information
	if(allocateMem)
		dest->nSpikeCnt = new int[lengthN];
	memcpy(&dest->nSpikeCnt[posN + destOffset], &src->nSpikeCnt[posN], sizeof(int) * lengthN);
}


#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::assignPoissonFiringRate_CPU(int netId) {
#else // POSIX
	void* SNN::assignPoissonFiringRate_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);

	for (int lGrpId = 0; lGrpId < networkConfigs[netId].numGroups; lGrpId++) {
		// given group of neurons belong to the poisson group....
		if (groupConfigs[netId][lGrpId].isSpikeGenerator) {
			int lNId = groupConfigs[netId][lGrpId].lStartN;
			int gGrpId = groupConfigs[netId][lGrpId].gGrpId;
			PoissonRate* rate = groupConfigMDMap[gGrpId].ratePtr;

			// if spikeGenFunc group does not have a Poisson pointer, skip
			if (groupConfigMap[gGrpId].spikeGenFunc || rate == NULL)
				continue;

			assert(runtimeData[netId].poissonFireRate != NULL);
			assert(rate->isOnGPU() == false);
			// rates allocated on CPU
			memcpy(&runtimeData[netId].poissonFireRate[lNId - networkConfigs[netId].numNReg], rate->getRatePtrCPU(),
					sizeof(float) * rate->getNumNeurons());
		}
	}
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperAssignPoissonFiringRate_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> assignPoissonFiringRate_CPU(args->netId);
		pthread_exit(0);
	}
#endif

/*!
* \brief this function copies the weight state in core (CPU) memory space to manager (CPU) memory space
*
* This function:
* copy wt, wtChange, synSpikeTime
*
* This function is only called by fetchWeightState(). Only the copying direction from device to host is required.
*
* \param[in] netId the id of a local network, which is the same as the device (GPU) id
* \param[in] lGrpId the local group id in a local network, which specifies the group(s) to be copied
*
* \sa fetchWeightState
* \since v4.0
*/
void SNN::copyWeightState(int netId, int lGrpId) {
	// refresh pre-connection info (Npre, cumulativePre) in the manager first,
	// since the synapse range below is computed from the manager's copy
	copyPreConnectionInfo(netId, lGrpId, &managerRuntimeData, &runtimeData[netId], false);

	// determine the synapse range [synOffset, synOffset + synLength) to copy
	int synOffset = 0;
	int synLength = 0;
	if (lGrpId == ALL) {
		synLength = networkConfigs[netId].numPreSynNet;
	} else {
		const int firstN = groupConfigs[netId][lGrpId].lStartN;
		const int lastN = groupConfigs[netId][lGrpId].lEndN;
		for (int lNId = firstN; lNId <= lastN; lNId++)
			synLength += managerRuntimeData.Npre[lNId];
		synOffset = managerRuntimeData.cumulativePre[firstN];
	}

	assert(synOffset < networkConfigs[netId].numPreSynNet || networkConfigs[netId].numPreSynNet == 0);
	assert(synLength <= networkConfigs[netId].numPreSynNet);

	// copy the synaptic weights for the selected range
	memcpy(&managerRuntimeData.wt[synOffset], &runtimeData[netId].wt[synOffset], sizeof(float) * synLength);

	// weight derivatives only exist when weights can actually change
	if ((!sim_with_fixedwts) || sim_with_stdp) {
		memcpy(&managerRuntimeData.wtChange[synOffset], &runtimeData[netId].wtChange[synOffset], sizeof(float) * synLength);
	}
}

//! \brief Intentionally a no-op on the CPU backend.
//! The CPU computing backend reads networkConfigs[] in place, so there is no
//! separate device-side copy to synchronize (unlike the GPU backend).
void SNN::copyNetworkConfig(int netId) {
	// do nothing, CPU computing backend can access networkConfigs[] directly
}

//! copies the neuron-to-group lookup array of a local network into manager memory
void SNN::copyGrpIdsLookupArray(int netId) {
	const size_t numBytes = sizeof(short int) * networkConfigs[netId].numNAssigned;
	memcpy(managerRuntimeData.grpIds, runtimeData[netId].grpIds, numBytes);
}

//! copies the presynaptic connection-id lookup array of a local network into manager memory
void SNN::copyConnIdsLookupArray(int netId) {
	const size_t numBytes = sizeof(short int) * networkConfigs[netId].numPreSynNet;
	memcpy(managerRuntimeData.connIdsPreIdx, runtimeData[netId].connIdsPreIdx, numBytes);
}

//! copies the per-neuron last-spike-time array of a local network into manager memory
//! (only the numN local neurons are copied, not the full numNAssigned range)
void SNN::copyLastSpikeTime(int netId) {
	const size_t numBytes = sizeof(int) * networkConfigs[netId].numN;
	memcpy(managerRuntimeData.lastSpikeTime, runtimeData[netId].lastSpikeTime, numBytes);
}

/*!
* \brief This function fetches the spike counts of the local network specified by netId
*/
void SNN::copyNetworkSpikeCount(int netId,
	unsigned int* spikeCountD1, unsigned int* spikeCountD2,
	unsigned int* spikeCountExtD1, unsigned int* spikeCountExtD2) {

	*spikeCountExtD2 = runtimeData[netId].spikeCountExtRxD2;
	*spikeCountExtD1 = runtimeData[netId].spikeCountExtRxD1;
	*spikeCountD2 = runtimeData[netId].spikeCountD2;
	*spikeCountD1 = runtimeData[netId].spikeCountD1;
}

/*!
* \brief This function fetches the spike tables of the local network specified by netId
*
* \param[in] netId the id of the local network of which timeTableD1(D2) and firingTableD1(D2) are copied to manager runtime data
*/
void SNN::copySpikeTables(int netId) {
	unsigned int spikeCountD1Sec, spikeCountD2Sec, spikeCountLastSecLeftD2;

	spikeCountLastSecLeftD2 = runtimeData[netId].spikeCountLastSecLeftD2;
	spikeCountD2Sec = runtimeData[netId].spikeCountD2Sec;
	spikeCountD1Sec = runtimeData[netId].spikeCountD1Sec;
	memcpy(managerRuntimeData.firingTableD2, runtimeData[netId].firingTableD2, sizeof(int) * (spikeCountD2Sec + spikeCountLastSecLeftD2));
	memcpy(managerRuntimeData.firingTableD1, runtimeData[netId].firingTableD1, sizeof(int) * spikeCountD1Sec);
	memcpy(managerRuntimeData.timeTableD2, runtimeData[netId].timeTableD2, sizeof(int) * (1000 + networkConfigs[netId].maxDelay + 1));
	memcpy(managerRuntimeData.timeTableD1, runtimeData[netId].timeTableD1, sizeof(int) * (1000 + networkConfigs[netId].maxDelay + 1));
}

#if defined(WIN32) || defined(WIN64) || defined(__APPLE__)
	void SNN::deleteRuntimeData_CPU(int netId) {
#else // POSIX
	void* SNN::deleteRuntimeData_CPU(int netId) {
#endif
	assert(runtimeData[netId].memType == CPU_MEM);
	// free all pointers
	delete [] runtimeData[netId].voltage;
	delete [] runtimeData[netId].nextVoltage;
	delete [] runtimeData[netId].recovery;
	delete [] runtimeData[netId].current;
	delete [] runtimeData[netId].extCurrent;
	delete [] runtimeData[netId].curSpike;
	delete [] runtimeData[netId].Npre;
	delete [] runtimeData[netId].Npre_plastic;
	delete [] runtimeData[netId].Npre_plasticInv;
	delete [] runtimeData[netId].Npost;
	delete [] runtimeData[netId].cumulativePost;
	delete [] runtimeData[netId].cumulativePre;
	delete [] runtimeData[netId].synSpikeTime;
	delete [] runtimeData[netId].wt;
	delete [] runtimeData[netId].wtChange;
	delete [] runtimeData[netId].maxSynWt;
	delete [] runtimeData[netId].nSpikeCnt;
	delete [] runtimeData[netId].avgFiring;
	delete [] runtimeData[netId].baseFiring;
	delete [] runtimeData[netId].baseFiringInv;

	delete [] runtimeData[netId].grpDA;
	delete [] runtimeData[netId].grp5HT;
	delete [] runtimeData[netId].grpACh;
	delete [] runtimeData[netId].grpNE;

	delete [] runtimeData[netId].grpDABuffer;
	delete [] runtimeData[netId].grp5HTBuffer;
	delete [] runtimeData[netId].grpAChBuffer;
	delete [] runtimeData[netId].grpNEBuffer;

	if (networkConfigs[netId].sim_with_nm) {
		delete[] runtimeData[netId].nVBuffer;
		delete[] runtimeData[netId].nUBuffer;
		delete[] runtimeData[netId].nIBuffer;
	}

	delete [] runtimeData[netId].grpIds;

	delete [] runtimeData[netId].Izh_a;
	delete [] runtimeData[netId].Izh_b;
	delete [] runtimeData[netId].Izh_c;
	delete [] runtimeData[netId].Izh_d;
	delete [] runtimeData[netId].Izh_C;
	delete [] runtimeData[netId].Izh_k;
	delete [] runtimeData[netId].Izh_vr;
	delete [] runtimeData[netId].Izh_vt;
	delete [] runtimeData[netId].Izh_vpeak;

	delete [] runtimeData[netId].lif_tau_m;
	delete [] runtimeData[netId].lif_tau_ref;
	delete [] runtimeData[netId].lif_tau_ref_c;
	delete [] runtimeData[netId].lif_vTh;
	delete [] runtimeData[netId].lif_vReset;
	delete [] runtimeData[netId].lif_gain;
	delete [] runtimeData[netId].lif_bias;

	// conductance buffers: rise/decay pairs only exist when the corresponding
	// rise-time mode was enabled at setup
	delete [] runtimeData[netId].gAMPA;
	if (sim_with_NMDA_rise) {
		delete [] runtimeData[netId].gNMDA_r;
		delete [] runtimeData[netId].gNMDA_d;
	}
	else {
		delete [] runtimeData[netId].gNMDA;
	}
	delete [] runtimeData[netId].gGABAa;
	if (sim_with_GABAb_rise) {
		delete [] runtimeData[netId].gGABAb_r;
		delete [] runtimeData[netId].gGABAb_d;
	}
	else {
		delete [] runtimeData[netId].gGABAb;
	}

	delete [] runtimeData[netId].stpu;
	delete [] runtimeData[netId].stpx;

	delete [] runtimeData[netId].connIdsPreIdx;

	delete [] runtimeData[netId].postDelayInfo;
	delete [] runtimeData[netId].postSynapticIds;
	delete [] runtimeData[netId].preSynapticIds;
	delete [] runtimeData[netId].I_set;
	delete [] runtimeData[netId].poissonFireRate;
	delete [] runtimeData[netId].lastSpikeTime;
	delete [] runtimeData[netId].spikeGenBits;

	delete [] runtimeData[netId].timeTableD1;
	delete [] runtimeData[netId].timeTableD2;

	delete [] runtimeData[netId].firingTableD2;
	delete [] runtimeData[netId].firingTableD1;

	// free the per-group external firing tables and the arrays of pointers.
	// Unlike the GPU backend, the pointers are directly accessible in host memory,
	// so no staging copy into a temporary array is needed. Groups without external
	// connections hold NULL (delete[] on NULL is a no-op).
	for (int i = 0; i < networkConfigs[netId].numGroups; i++)
		delete [] runtimeData[netId].extFiringTableD2[i];
	delete [] runtimeData[netId].extFiringTableD2;

	for (int i = 0; i < networkConfigs[netId].numGroups; i++)
		delete [] runtimeData[netId].extFiringTableD1[i];
	delete [] runtimeData[netId].extFiringTableD1;

	delete [] runtimeData[netId].extFiringTableEndIdxD2;
	delete [] runtimeData[netId].extFiringTableEndIdxD1;

	if (runtimeData[netId].randNum != NULL) delete [] runtimeData[netId].randNum;
	runtimeData[netId].randNum = NULL;

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__)
	// the POSIX variant is a pthread entry helper and must return a void*;
	// falling off the end of a value-returning function is undefined behavior
	return NULL;
#endif
}

#if !defined(WIN32) && !defined(WIN64) && !defined(__APPLE__) // Linux or MAC
	// Static multithreading subroutine method - helper for the above method
	void* SNN::helperDeleteRuntimeData_CPU(void* arguments) {
		ThreadStruct* args = (ThreadStruct*) arguments;
		//printf("\nThread ID: %lu and CPU: %d\n",pthread_self(), sched_getcpu());
		((SNN *)args->snn_pointer) -> deleteRuntimeData_CPU(args->netId);
		pthread_exit(0);
	}
#endif
